In [21]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the raw organizations table from a path relative to the working directory.
df = pd.read_csv("../data/raw/org.csv")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()


   Index  Organization Id                     Name  \
0      1  FAB0d41d5b5d22c              Ferrell LLC   
1      2  6A7EdDEA9FaDC52  Mckinney, Riley and Day   
2      3  0bFED1ADAE4bcC1               Hester Ltd   
3      4  2bFC1Be8a4ce42f           Holder-Sellers   
4      5  9eE8A6a4Eb96C24              Mayer Group   

                          Website           Country  \
0              https://price.net/  Papua New Guinea   
1  http://www.hall-buchanan.info/           Finland   
2       http://sullivan-reed.com/             China   
3             https://becker.com/      Turkmenistan   
4          http://www.brewer.com/         Mauritius   

                                      Description  Founded  \
0             Horizontal empowering knowledgebase     1990   
1             User-centric system-worthy leverage     2015   
2                  Switchable scalable moratorium     1971   
3  De-engineered systemic artificial intelligence     2004   
4              Synchronized needs-

In [22]:
# DataFrame.info() prints the schema summary itself and returns None; calling it
# directly avoids the extra "None" line that print(df.info()) produces.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Index                100 non-null    int64 
 1   Organization Id      100 non-null    object
 2   Name                 100 non-null    object
 3   Website              100 non-null    object
 4   Country              100 non-null    object
 5   Description          100 non-null    object
 6   Founded              100 non-null    int64 
 7   Industry             100 non-null    object
 8   Number of employees  100 non-null    int64 
dtypes: int64(3), object(6)
memory usage: 7.2+ KB
None


In [23]:
# Number of missing values in each column.
df.isna().sum()



Index                  0
Organization Id        0
Name                   0
Website                0
Country                0
Description            0
Founded                0
Industry               0
Number of employees    0
dtype: int64

In [26]:
# Remove exact duplicate rows.
df = df.drop_duplicates()

# Fill missing values with the "Unknown" placeholder in non-numeric columns only.
# Writing a string into a numeric column would turn it into an object column,
# so numeric gaps are left as NaN.
text_columns = df.select_dtypes(exclude="number").columns
df = df.fillna({column: "Unknown" for column in text_columns})

# Normalise headers: trim whitespace, lower-case, and replace spaces with
# underscores, e.g. "Number of employees" -> "number_of_employees".
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

df.head()

Unnamed: 0,index,organization_id,name,website,country,description,founded,industry,number_of_employees
0,1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498
1,2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952
2,3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287
3,4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921
4,5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870


In [28]:
# Coerce the numeric fields; values that cannot be parsed become NaN instead of raising.
df["founded"] = pd.to_numeric(df["founded"], errors="coerce")
df["number_of_employees"] = pd.to_numeric(df["number_of_employees"], errors="coerce")

# Host part of the URL: drop a leading http:// or https:// scheme, then keep
# everything before the first "/". The pattern is anchored and case-insensitive,
# so "HTTPS://Example.com/" is handled and only the start of the URL is touched.
df["website_domain"] = (
    df["website"]
    .str.replace(r"^https?://", "", case=False, regex=True)
    .str.split("/")
    .str[0]
)

# Ages are measured against a fixed reference year rather than today's date.
current_year = 2024
df["company_age"] = current_year - df["founded"]
df.head()


Unnamed: 0,index,organization_id,name,website,country,description,founded,industry,number_of_employees,website_domain,company_age
0,1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498,price.net,34
1,2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952,www.hall-buchanan.info,9
2,3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287,sullivan-reed.com,53
3,4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921,becker.com,20
4,5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870,www.brewer.com,33


In [30]:
# Rescale the head count with MinMaxScaler and store it as a new column.
# The double brackets keep the input two-dimensional (a one-column frame).
employee_counts = df[["number_of_employees"]]
scaler = MinMaxScaler()
df["employees_normalized"] = scaler.fit_transform(employee_counts)
df.head()


Unnamed: 0,index,organization_id,name,website,country,description,founded,industry,number_of_employees,website_domain,company_age,employees_normalized
0,1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498,price.net,34,0.334256
1,2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952,www.hall-buchanan.info,9,0.483246
2,3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287,sullivan-reed.com,53,0.517574
3,4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921,becker.com,20,0.070192
4,5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870,www.brewer.com,33,0.782252


In [36]:
def classify_company(size, large_threshold=8000, medium_threshold=3000):
    """Bucket a company by head count.

    Parameters
    ----------
    size : int or float
        Number of employees. Missing values (NaN or None) are accepted.
    large_threshold : int, default 8000
        Minimum head count for the "Large" bucket.
    medium_threshold : int, default 3000
        Minimum head count for the "Medium" bucket.

    Returns
    -------
    str
        "Large", "Medium" or "Small"; "Unknown" when ``size`` is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # head count would fall through to "Small".
    if pd.isna(size):
        return "Unknown"
    if size >= large_threshold:
        return "Large"
    if size >= medium_threshold:
        return "Medium"
    return "Small"

# Label every row with its size bucket, one head count at a time.
df["company_size"] = df["number_of_employees"].map(classify_company)

def age_category(age, old_threshold=40, established_threshold=20):
    """Bucket a company by its age in years.

    Parameters
    ----------
    age : int or float
        Company age in years. Missing values (NaN or None) are accepted.
    old_threshold : int, default 40
        Minimum age for the "Old" bucket.
    established_threshold : int, default 20
        Minimum age for the "Established" bucket.

    Returns
    -------
    str
        "Old", "Established" or "Startup"; "Unknown" when ``age`` is missing.
    """
    # Every comparison with NaN is False, so without this guard a company with
    # no usable age would be labelled "Startup".
    if pd.isna(age):
        return "Unknown"
    if age >= old_threshold:
        return "Old"
    if age >= established_threshold:
        return "Established"
    return "Startup"

# Label every row with its age bucket, then preview the result.
df["age_category"] = df["company_age"].map(age_category)
df.head()



Unnamed: 0,index,organization_id,name,website,country,description,founded,industry,number_of_employees,website_domain,company_age,employees_normalized,company_size,age_category
0,1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498,price.net,34,0.334256,Medium,Established
1,2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952,www.hall-buchanan.info,9,0.483246,Medium,Startup
2,3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287,sullivan-reed.com,53,0.517574,Medium,Old
3,4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921,becker.com,20,0.070192,Small,Established
4,5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870,www.brewer.com,33,0.782252,Medium,Established


In [37]:
# Write the processed table to CSV in the working directory; index=False leaves
# the row index out of the file. The frame is then shown as the cell output.
df.to_csv(path_or_buf="processed_companies.csv", index=False)
df


Unnamed: 0,index,organization_id,name,website,country,description,founded,industry,number_of_employees,website_domain,company_age,employees_normalized,company_size,age_category
0,1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498,price.net,34,0.334256,Medium,Established
1,2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952,www.hall-buchanan.info,9,0.483246,Medium,Startup
2,3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287,sullivan-reed.com,53,0.517574,Medium,Old
3,4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921,becker.com,20,0.070192,Small,Established
4,5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870,www.brewer.com,33,0.782252,Medium,Established
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,0a0bfFbBbB8eC7c,Holmes Group,https://mcdowell.org/,Ethiopia,Right-sized zero tolerance focus group,1975,Photography,2988,mcdowell.org,49,0.281996,Small,Old
96,97,BA6Cd9Dae2Efd62,Good Ltd,http://duffy.com/,Anguilla,Reverse-engineered composite moratorium,1971,Consumer Services,4292,duffy.com,53,0.415616,Medium,Old
97,98,E7df80C60Abd7f9,Clements-Espinoza,http://www.flowers.net/,Falkland Islands (Malvinas),Progressive modular hub,1991,Broadcast Media,236,www.flowers.net,33,0.000000,Small,Established
98,99,AFc285dbE2fEd24,Mendez Inc,https://www.burke.net/,Kyrgyz Republic,User-friendly exuding migration,1993,Education Management,339,www.burke.net,31,0.010554,Small,Established
