In [47]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report

In [48]:
# Load the job-postings CSV; the path is relative to the notebook's working directory.
jobs_csv_path = 'Resources/super_clean_job.csv'
df = pd.read_csv(jobs_csv_path)

In [49]:
# Check each column's dtype to see which features are numeric and which are
# object (string) columns that will need encoding before modelling.
df.dtypes

Job Title             object
Salary Estimate       object
Rating               float64
Company Name          object
Location              object
Headquarters          object
Size                  object
Type of ownership     object
Industry              object
Sector                object
Revenue               object
dtype: object

In [50]:
# List the column labels (for a DataFrame, `.keys()` returns this same Index).
df.columns

Index(['Job Title', 'Salary Estimate', 'Rating', 'Company Name', 'Location',
       'Headquarters', 'Size', 'Type of ownership', 'Industry', 'Sector',
       'Revenue'],
      dtype='object')

In [51]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),3.2,Vera Institute of Justice\n3.2,"New York, NY","New York, NY",201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,$100 to $500 million (USD)
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),3.8,Visiting Nurse Service of New York\n3.8,"New York, NY","New York, NY",10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD)
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),3.4,Squarespace\n3.4,"New York, NY","New York, NY",1001 to 5000 employees,Company - Private,Internet,Information Technology,Unknown / Non-Applicable
3,Data Analyst,$37K-$66K (Glassdoor est.),4.1,Celerity\n4.1,"New York, NY","McLean, VA",201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,$50 to $100 million (USD)
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),3.9,FanDuel\n3.9,"New York, NY","New York, NY",501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",$100 to $500 million (USD)


In [52]:
# Report how many missing values each column contains.
null_counts = df.isnull().sum()
for column, n_null in null_counts.items():
    print(f"Column {column} has {n_null} null values")

Column Job Title has 2 null values
Column Salary Estimate has 2 null values
Column Rating has 2 null values
Column Company Name has 2 null values
Column Location has 2 null values
Column Headquarters has 2 null values
Column Size has 2 null values
Column Type of ownership has 2 null values
Column Industry has 2 null values
Column Sector has 2 null values
Column Revenue has 2 null values


In [53]:
# Remove every row that has at least one missing value. Rebinding the result
# (instead of inplace=True) avoids mutating the frame object behind the name.
df = df.dropna()

In [54]:
# Renumber the rows 0..n-1 now that rows with nulls have been removed.
# drop=True discards the old index instead of inserting it as a stray 'index'
# column, and makes this cell safe to re-run (without it, a second run adds
# 'level_0' and a third run raises ValueError).
df = df.reset_index(drop=True)

In [55]:
# Feature matrix: the numeric company rating plus the categorical descriptors
# of the posting's location and company.
feature_columns = [
    'Rating',
    'Location',
    'Size',
    'Type of ownership',
    'Industry',
    'Sector',
    'Revenue',
]
X = df[feature_columns]
X

Unnamed: 0,Rating,Location,Size,Type of ownership,Industry,Sector,Revenue
0,3.2,"New York, NY",201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,$100 to $500 million (USD)
1,3.8,"New York, NY",10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD)
2,3.4,"New York, NY",1001 to 5000 employees,Company - Private,Internet,Information Technology,Unknown / Non-Applicable
3,4.1,"New York, NY",201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,$50 to $100 million (USD)
4,3.9,"New York, NY",501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",$100 to $500 million (USD)
...,...,...,...,...,...,...,...
1849,4.1,"Broomfield, CO",51 to 200 employees,Company - Private,Computer Hardware & Software,Information Technology,$25 to $50 million (USD)
1850,2.5,"Denver, CO",51 to 200 employees,Company - Private,Staffing & Outsourcing,Business Services,Unknown / Non-Applicable
1851,2.9,"Centennial, CO",10000+ employees,Company - Public,Wholesale,Business Services,$10+ billion (USD)
1852,3.1,"Centennial, CO",201 to 500 employees,Company - Private,Enterprise Software & Network Solutions,Information Technology,$25 to $50 million (USD)


In [56]:
# Target: the raw salary-range string attached to each posting.
target_column = 'Salary Estimate'
y = df[target_column]
y

0        $37K-$66K (Glassdoor est.)
1        $37K-$66K (Glassdoor est.)
2        $37K-$66K (Glassdoor est.)
3        $37K-$66K (Glassdoor est.)
4        $37K-$66K (Glassdoor est.)
                   ...             
1849    $78K-$104K (Glassdoor est.)
1850    $78K-$104K (Glassdoor est.)
1851    $78K-$104K (Glassdoor est.)
1852    $78K-$104K (Glassdoor est.)
1853    $78K-$104K (Glassdoor est.)
Name: Salary Estimate, Length: 1854, dtype: object

In [57]:
# One-hot encode the feature frame: each categorical column expands into one
# indicator column per distinct value.
X_dummies = pd.get_dummies(X)
encoded_columns = X_dummies.columns
print(encoded_columns)
X_dummies

Index(['Rating', 'Location_Addison, TX', 'Location_Alachua, FL',
       'Location_Alameda, CA', 'Location_Alhambra, CA',
       'Location_Allegheny West, PA', 'Location_Allen, TX',
       'Location_American Fork, UT', 'Location_Anaheim, CA',
       'Location_Arcadia, CA',
       ...
       'Revenue_$10+ billion (USD)', 'Revenue_$100 to $500 million (USD)',
       'Revenue_$2 to $5 billion (USD)', 'Revenue_$25 to $50 million (USD)',
       'Revenue_$5 to $10 billion (USD)', 'Revenue_$5 to $10 million (USD)',
       'Revenue_$50 to $100 million (USD)',
       'Revenue_$500 million to $1 billion (USD)',
       'Revenue_Less than $1 million (USD)',
       'Revenue_Unknown / Non-Applicable'],
      dtype='object', length=379)


Unnamed: 0,Rating,"Location_Addison, TX","Location_Alachua, FL","Location_Alameda, CA","Location_Alhambra, CA","Location_Allegheny West, PA","Location_Allen, TX","Location_American Fork, UT","Location_Anaheim, CA","Location_Arcadia, CA",...,Revenue_$10+ billion (USD),Revenue_$100 to $500 million (USD),Revenue_$2 to $5 billion (USD),Revenue_$25 to $50 million (USD),Revenue_$5 to $10 billion (USD),Revenue_$5 to $10 million (USD),Revenue_$50 to $100 million (USD),Revenue_$500 million to $1 billion (USD),Revenue_Less than $1 million (USD),Revenue_Unknown / Non-Applicable
0,3.2,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3.8,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3.4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,3.9,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1849,4.1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1850,2.5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1851,2.9,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1852,3.1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [58]:
# Map every distinct salary-range string to an integer class id, then peek at
# the first 100 encoded labels.
salary_encoder = LabelEncoder()
y_label = salary_encoder.fit_transform(df['Salary Estimate'])
y_label[:100]

array([17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
       17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 38, 38, 38, 38, 38, 38, 38,
       38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 49, 49,
       49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
       49, 49, 49, 49, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
       48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 63, 63, 63, 63])

In [59]:
# Show the distinct salary-range strings that the classifier treats as classes.
salary_estimates = df["Salary Estimate"]
salary_estimates.unique()

array(['$37K-$66K (Glassdoor est.)', '$46K-$87K (Glassdoor est.)',
       '$51K-$88K (Glassdoor est.)', '$51K-$87K (Glassdoor est.)',
       '$59K-$85K (Glassdoor est.)', '$43K-$76K (Glassdoor est.)',
       '$60K-$110K (Glassdoor est.)', '$41K-$78K (Glassdoor est.)',
       '$45K-$88K (Glassdoor est.)', '$73K-$127K (Glassdoor est.)',
       '$84K-$90K (Glassdoor est.)', '$27K-$52K (Glassdoor est.)',
       '$42K-$74K (Glassdoor est.)', '$77K-$132K (Glassdoor est.)',
       '$98K-$114K (Glassdoor est.)', '$48K-$96K (Glassdoor est.)',
       '$26K-$47K (Glassdoor est.)', '$31K-$59K (Glassdoor est.)',
       '$47K-$81K (Glassdoor est.)', '$43K-$69K (Glassdoor est.)',
       '$49K-$112K (Glassdoor est.)', '$30K-$54K (Glassdoor est.)',
       '$55K-$103K (Glassdoor est.)', '$37K-$70K (Glassdoor est.)',
       '$57K-$103K (Glassdoor est.)', '$35K-$45K (Glassdoor est.)',
       '$42K-$66K (Glassdoor est.)', '$65K-$81K (Glassdoor est.)',
       '$113K-$132K (Glassdoor est.)', '$60K-$66K (Glas

In [60]:
# Split the encoded features and labels into train and test sets using
# scikit-learn's default split size; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummies,
    y_label,
    random_state=1,
)

In [61]:
# Fit the standardizer on the training split only, then apply it to that split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[ 1.02564092, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045,  1.75733756],
       [ 0.25668683, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045,  1.75733756],
       [-0.66605807, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045, -0.56904264],
       ...,
       [ 0.10289601, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045, -0.56904264],
       [-0.51226726, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045, -0.56904264],
       [-0.0508948 , -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045, -0.56904264]])

In [62]:
# Apply the scaler that was fitted on the training split to the test split.
# The scaler is deliberately NOT refitted here, so the test data does not
# influence the scaling parameters.
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 1.94838582, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045,  1.75733756],
       [ 1.94838582, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045, -0.56904264],
       [ 0.10289601, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045, -0.56904264],
       ...,
       [ 0.10289601, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045, -0.56904264],
       [ 0.10289601, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045, -0.56904264],
       [-0.51226726, -0.02683174, -0.02683174, ..., -0.2086709 ,
        -0.19910045, -0.56904264]])

In [63]:
#X_train_scaled.to_csv("temp_scaled.csv")
#np.savetxt("temp_scaled.csv", X_train_scaled, delimiter=",")

In [64]:
#np.savetxt("tempy_scaled.csv", y_train, delimiter=",")

In [65]:
# Fit a 100-tree random forest on the scaled training data, then report the
# model's score on both the training and the held-out test split.
clf = RandomForestClassifier(n_estimators=100, random_state=1)
clf.fit(X_train_scaled, y_train)
train_score = clf.score(X_train_scaled, y_train)
test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

Training Score: 0.8712230215827338
Testing Score: 0.24568965517241378
