In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import time
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pickle
import matplotlib.pyplot as plt

In [5]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides pandas / scikit-learn
# deprecation and chained-assignment warnings raised by later cells;
# consider narrowing the filter to specific categories.
import warnings 
warnings.filterwarnings('ignore')

In [6]:
def selectkbest(indep_X, dep_Y, n):
    """Keep the ``n`` features of ``indep_X`` that score highest against ``dep_Y``.

    Scoring uses ``f_regression``, the univariate F-test for a continuous
    target. The selector is fitted on the data passed in and the same data is
    then reduced to the selected columns.

    Parameters
    ----------
    indep_X : feature matrix handed to ``SelectKBest.fit``.
    dep_Y : target values handed to ``SelectKBest.fit``.
    n : number of features to keep (``k`` of ``SelectKBest``).

    Returns
    -------
    The transformed feature matrix containing only the selected columns.
    """
    selector = SelectKBest(score_func=f_regression, k=n)
    # fit + transform in one call; equivalent to fit(...) followed by transform(...)
    return selector.fit_transform(indep_X, dep_Y)



In [7]:
def split_scalar(indep_X,dep_Y):
    """Split the data 75/25 and standardise the features.

    The split uses ``test_size=0.25`` and ``random_state=0``. The
    ``StandardScaler`` is fitted on the training features only and then
    applied to both the training and the test features; the targets are
    returned unscaled.

    Returns
    -------
    tuple ``(X_train, X_test, y_train, y_test)`` with scaled feature arrays.
    """
    train_features, test_features, y_train, y_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )
    # Learn mean/variance from the training portion only.
    scaler = StandardScaler().fit(train_features)
    return (
        scaler.transform(train_features),
        scaler.transform(test_features),
        y_train,
        y_test,
    )
    
def r2_prediction(regressor,X_test,y_test):
    """Return the R^2 score of ``regressor`` on the given test data.

    Parameters
    ----------
    regressor : fitted estimator exposing ``predict``.
    X_test : features passed to ``regressor.predict``.
    y_test : true target values compared against the predictions.
    """
    from sklearn.metrics import r2_score

    predictions = regressor.predict(X_test)
    return r2_score(y_test, predictions)
 
def Linear(X_train,y_train,X_test,y_test=None):
    """Fit an ordinary least-squares model and return its test R^2.

    Parameters
    ----------
    X_train, y_train : training features and targets used to fit
        ``LinearRegression``.
    X_test : test features to predict on.
    y_test : true test targets. Optional for backward compatibility: when
        omitted, the notebook-level variable ``y_test`` is used, which is what
        this function previously relied on implicitly.

    Returns
    -------
    The R^2 score computed by ``r2_prediction``.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no global ``y_test`` exists.
    """
    from sklearn.linear_model import LinearRegression

    if y_test is None:
        # Older call sites pass only three arguments and depend on a global
        # `y_test`; keep them working, but prefer passing it explicitly.
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError("name 'y_test' is not defined") from None

    # Fit the linear regression model on the training set
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)
    
def svm_linear(X_train, y_train, X_test, y_test):
    """Tune a linear-kernel SVR with grid search and return its test R^2.

    The regularisation parameter ``C`` is searched over ``[0.1, 1, 10]``
    using 5-fold cross-validation scored by R^2 on the training data. The
    best estimator found is then evaluated on the test data through
    ``r2_prediction``.
    """
    search = GridSearchCV(
        SVR(kernel='linear'),
        {'C': [0.1, 1, 10]},
        cv=5,
        scoring='r2',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return r2_prediction(search.best_estimator_, X_test, y_test)
    
def svm_NL(X_train, y_train, X_test, y_test):
    """Tune an RBF-kernel SVR with grid search and return its test R^2.

    The grid covers ``C`` in ``[0.1, 1, 10]`` and ``gamma`` in
    ``['scale', 'auto']``, evaluated with 5-fold cross-validation scored by
    R^2 on the training data. The best estimator found is then evaluated on
    the test data through ``r2_prediction``.
    """
    search_space = {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
    }
    search = GridSearchCV(SVR(kernel='rbf'), search_space, cv=5, scoring='r2', n_jobs=-1)
    search.fit(X_train, y_train)
    return r2_prediction(search.best_estimator_, X_test, y_test)
    
def Decision(X_train, y_train, X_test, y_test):
    """Tune a decision-tree regressor with grid search and return its test R^2.

    The grid covers ``max_depth`` in ``[None, 10, 20, 30]`` and
    ``min_samples_split`` in ``[2, 5, 10]``, evaluated with 5-fold
    cross-validation scored by R^2. The tree is seeded with
    ``random_state=0``. The best estimator found is evaluated on the test
    data through ``r2_prediction``.
    """
    search_space = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    }
    tree = DecisionTreeRegressor(random_state=0)
    search = GridSearchCV(tree, search_space, cv=5, scoring='r2', n_jobs=-1)
    search.fit(X_train, y_train)
    return r2_prediction(search.best_estimator_, X_test, y_test)

# Train and tune a Random Forest regressor.
# NOTE(review): this function's name matches the standard-library `random`
# module, so importing that module in this notebook would clash with it.
def random(X_train, y_train, X_test, y_test):
    """Tune a random-forest regressor with grid search and return its test R^2.

    The grid covers ``n_estimators`` in ``[10, 50, 100]`` and ``max_depth``
    in ``[None, 10, 20]``, evaluated with 5-fold cross-validation scored by
    R^2. The forest is seeded with ``random_state=0``. The best estimator
    found is evaluated on the test data through ``r2_prediction``.
    """
    search_space = {
        'n_estimators': [10, 50, 100],
        'max_depth': [None, 10, 20],
    }
    forest = RandomForestRegressor(random_state=0)
    search = GridSearchCV(forest, search_space, cv=5, scoring='r2', n_jobs=-1)
    search.fit(X_train, y_train)
    return r2_prediction(search.best_estimator_, X_test, y_test)

In [8]:
def selectk_regression(acclin,accsvml,accsvmnl,accdes,accrf): 
    """Collect per-model R^2 scores into a one-row summary table.

    Parameters
    ----------
    acclin, accsvml, accsvmnl, accdes, accrf : sequences of scores for the
        linear, linear-SVR, RBF-SVR, decision-tree and random-forest models.
        Only the entry at position 0 of each sequence is used, because the
        table has a single row.

    Returns
    -------
    pandas.DataFrame with index ``['ChiSquare']`` and columns
    ``['Linear', 'SVMl', 'SVMnl', 'Decision', 'Random']``.

    Raises
    ------
    IndexError
        If any of the score sequences is empty.
    """
    # Column label -> score sequence, in the column order of the result.
    scores_by_column = {
        'Linear': acclin,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'Decision': accdes,
        'Random': accrf,
    }
    # NOTE(review): the row label says 'ChiSquare' although the visible
    # feature selection uses f_regression; the label is kept so existing
    # lookups by that name keep working.
    dataframe = pd.DataFrame(index=['ChiSquare'], columns=list(scores_by_column))
    for number, idex in enumerate(dataframe.index):
        for column, scores in scores_by_column.items():
            # Single .loc assignment instead of chained indexing
            # (dataframe[col][row] = v), which may write to a temporary copy
            # and is a no-op under pandas Copy-on-Write.
            dataframe.loc[idex, column] = scores[number]
    return dataframe

In [9]:
# Load the company data and strip the free-text columns in place.
dataset = pd.read_csv("Pre_AI_Companies.csv", index_col=None)
dataset.drop(columns=['Company Name', 'Description', 'Headquarters'], inplace=True)

# `df` is an alias of `dataset` (same object, not a copy).
df = dataset

In [10]:
# Display the frame that remains after the text columns were dropped.
df

Unnamed: 0,Founded,Annual Revenue,Glassdoor Score
0,2009.0,479.5,3.75
1,2012.0,338.2,3.75
2,1998.0,305600.0,4.45
3,2016.0,40.0,4.35
4,2011.0,69.2,3.15
...,...,...,...
95,2016.0,438.8,3.85
96,2016.0,100.0,3.45
97,2003.0,96770.0,3.65
98,2016.0,1400.0,3.75


In [18]:
# Target: annual revenue. Features: every other remaining column.
dep_Y = df['Annual Revenue']
indep_X = df.drop(columns=['Annual Revenue'])

In [19]:
# Keep the 2 best-scoring features.
# NOTE(review): the selector is fitted on all rows before the train/test
# split, so the test rows influence feature selection.
kbest = selectkbest(indep_X, dep_Y, 2)

# One list of R^2 scores per model family.
acclin, accsvml, accsvmnl, accdes, accrf = [], [], [], [], []

In [20]:
# Split and scale once, then fit and score every model exactly once.
# The previous `for i in kbest:` loop iterated over every row of the
# selected-feature array and refitted all models (including the grid
# searches) once per sample with identical inputs; only the first score of
# each list is ever read by selectk_regression.
X_train, X_test, y_train, y_test = split_scalar(kbest, dep_Y)

# Drop scores left over from an earlier run of this cell so position 0
# always reflects the current split.
for scores in (acclin, accsvml, accsvmnl, accdes, accrf):
    scores.clear()

# Linear is called without y_test here; it picks up the y_test defined above.
r2_lin = Linear(X_train, y_train, X_test)
acclin.append(r2_lin)

r2_sl = svm_linear(X_train, y_train, X_test, y_test)
accsvml.append(r2_sl)

r2_NL = svm_NL(X_train, y_train, X_test, y_test)
accsvmnl.append(r2_NL)

r2_d = Decision(X_train, y_train, X_test, y_test)
accdes.append(r2_d)

r2_r = random(X_train, y_train, X_test, y_test)
accrf.append(r2_r)

result = selectk_regression(acclin, accsvml, accsvmnl, accdes, accrf)

In [21]:
# Display the one-row table of test R^2 scores per model.
result

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.092687,-0.081292,-0.082918,0.084117,-0.09873
