# Data Cleaning 

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
Devops = pd.read_csv("D:/yesss.csv")

In [3]:
#Fill the 'Raised to Date', 'Deal Size', 'Total Invested Equity'with Mean
Devops['Raised to Date'] = Devops['Raised to Date'].fillna(Devops['Raised to Date'].mean())
Devops['Deal Size'] = Devops['Deal Size'].fillna(Devops['Deal Size'].mean())
Devops['Total Invested Equity'] = Devops['Total Invested Equity'].fillna(Devops['Total Invested Equity'].mean())

In [4]:
#Fill the 'Current Employees'with Median
Devops['Current Employees'] = Devops['Current Employees'].fillna(Devops['Current Employees'].median())

In [5]:
#Fill the "# Investors','# New Investors' with 0 for NaN
Devops['# Investors'] = Devops['# Investors'].fillna(0)
Devops['# New Investors'] = Devops['# New Investors'].fillna(0)

In [6]:
#Fill the "VC Round' with 'No Round' for NaN
Devops['VC Round'] = Devops['VC Round'].fillna('No Round')

#Fill the "Financing status' with 'No Round' for NaN
Devops['Financing Status'] = Devops['Financing Status'].fillna('None')

#Fill the "Year Founded" with Mode
Devops['Year Founded'] = Devops['Year Founded'].fillna(2014)

In [7]:
# Fill the Premoney Valuation, Post Valuation with -1
Devops['Pre-money Valuation'] = Devops['Pre-money Valuation'].fillna(-1)
Devops['Post Valuation'] = Devops['Post Valuation'].fillna(-1)

#Fill the "%Acquired", "price per Shae" columns with 0
Devops['% Acquired'] = Devops['% Acquired'].fillna(0)
Devops['Price per Share'] = Devops['Price per Share'].fillna(0)


#Fill the Series column with "No categorization"
Devops['Series'] = Devops['Series'].fillna('No Categorization')

In [8]:
Devops.isna().any()

Deal ID                     False
Company ID                  False
Primary Industry Sector     False
Primary Industry Group      False
Primary Industry Code       False
All Industries              False
Verticals                   False
Keywords                    False
Current Financing Status    False
Current Business Status     False
Universe                    False
CEO PBId                     True
CEO Education                True
Deal No.                    False
Deal ID.1                   False
Deal Date                   False
Deal Size                   False
Deal Size Status            False
Pre-money Valuation         False
Post Valuation              False
% Acquired                  False
Raised to Date              False
VC Round                    False
Price per Share             False
Series                      False
Deal Type                   False
Deal Class                  False
Total Invested Equity       False
Deal Status                 False
Business Statu

In [10]:
#Save the column after we are done 
Devops.to_csv("D:/DevOps_Cleaned.csv", index=False)

# Boruta feature engineering (using R Studio)

In [None]:
#Input the dataset inside
library(Boruta)
DevOps<- read.csv('D:/DevOps_Cleaned.csv')
library(caTools)                            #Split the dataset 


#Drop the Columns that are not related 
DevOps[ , c('Deal.ID', 'Company.ID','CEO.PBId','CEO.Education','HQ.Global.Sub.Region','Company.City')] <- list(NULL)


##########FEATURE IMPORTANCE ANALYSIS WITH BORUTA FOR Premoney valuation#################################
#Drop the Post Valuation 
DevOps_PreMoneyValuation = subset(DevOps, select = -c(Post.Valuation) )

#Create copy of the original dataset remove the medical results columns 
DevOps_train_1 = DevOps_PreMoneyValuation

#Perform Boruta analysis on the training set 
set.seed(100)
boruta_analysis_1 = Boruta(Pre.money.Valuation~., data = DevOps_train_1, maxRuns=200)

#Plot the Boruta results abd see the dataframe 
plot(boruta_analysis_1,las=2,main="Boruta Analysis: Feature Importance for Pre.money.Valuation")
as.data.frame(boruta_analysis_1$finalDecision)
print(boruta_analysis_1)

#Get all the confirmed columns as we see here 
final.boruta <- TentativeRoughFix(boruta_analysis_1)
getSelectedAttributes(final.boruta, withTentative = F)

#Create the data frame of the final result derived from Boruta.
boruta.df <- attStats(final.boruta)
class(boruta.df)
print(boruta.df)


##########FEATURE IMPORTANCE ANALYSIS WITH BORUTA FOR Postmoney valuation#################################
#Drop the Post Valuation 
DevOps_PostValuation = subset(DevOps, select = -c(Pre.money.Valuation) )

#Create copy of the original dataset remove the medical results columns 
DevOps_train_2 = DevOps_PostValuation

#Perform Boruta analysis on the training set 
set.seed(110)
boruta_analysis_2 = Boruta(Post.Valuation~., data = DevOps_train_2, maxRuns=200)

#Plot the Boruta results abd see the dataframe 
plot(boruta_analysis_2,las=2,main="Boruta Analysis: Feature Importance for Post.Valuation ")
as.data.frame(boruta_analysis_2$finalDecision)
print(boruta_analysis_2)

#Get all the confirmed columns as we see here 
final.boruta <- TentativeRoughFix(boruta_analysis_2)
getSelectedAttributes(final.boruta, withTentative = F)

#Create the data frame of the final result derived from Boruta.
boruta.df <- attStats(final.boruta)
class(boruta.df)
print(boruta.df)


# Regression analysis

After performing the Boruta analysis with R, I will do the rest in Python 

In [10]:
#Install additionall packages 
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 


from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import  r2_score, mean_squared_error 
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge

### Premoney

In [11]:
#Assign the values to X and Y 
X_pre = Devops[["Deal Size", "% Acquired", "Raised to Date","VC Round",
                                 "Price per Share","Series","Deal Type", "Total Invested Equity",
                                 "Current Employees", "HQ Global Region", "Company Country"]]
y_pre= Devops[["Pre-money Valuation"]]

In [12]:
#Transform dataset into workable format
encode=LabelEncoder()
X_pre = X_pre.apply(encode.fit_transform)


#Splitting the training set and test set 
random.seed(111)
X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size =0.25, random_state=42)

In [13]:
#Building the regressor and fit the dataset in
#Random Forest
regressor_pre1 = RandomForestRegressor(n_estimators=100, criterion='mse', random_state=42, n_jobs=-1)
regressor_pre1.fit(X_train_pre, y_train_pre.squeeze())

#Decision Tree
regressor_pre2 = DecisionTreeRegressor(random_state= 0)
regressor_pre2.fit(X_train_pre, y_train_pre)

#Ridge Regression
regressor_pre3 = Ridge(alpha=1.0)
regressor_pre3.fit(X_train_pre, y_train_pre)

Ridge()

In [14]:
#Model accuracties and validation analysis
print('PRE MONEY VALUTION')

#Random forest
y_train_pre_preds1 = regressor_pre1.predict(X_train_pre)
y_test_pre_preds1 = regressor_pre1.predict(X_test_pre)
print('Random Forest')
print('MSE- train: %.3f, test: %.3f' % (mean_squared_error(y_train_pre, y_train_pre_preds1), mean_squared_error(y_test_pre, y_test_pre_preds1)))
print('R^2- train: %.3f, test: %.3f' % (r2_score(y_train_pre, y_train_pre_preds1),r2_score(y_test_pre, y_test_pre_preds1)))
print('-----------------------------------------------------')

#Decision Tree
y_train_pre_preds2 = regressor_pre2.predict(X_train_pre)
y_test_pre_preds2 = regressor_pre2.predict(X_test_pre)
print('Decision Tree')
print('MSE- train: %.3f, test: %.3f' % ( mean_squared_error(y_train_pre, y_train_pre_preds2), mean_squared_error(y_test_pre, y_test_pre_preds2)))
print('R^2c train: %.3f, test: %.3f' % ( r2_score(y_train_pre, y_train_pre_preds2),r2_score(y_test_pre, y_test_pre_preds2)))
print('-----------------------------------------------------')


#Ridge Regression
y_train_pre_preds3 = regressor_pre3.predict(X_train_pre)
y_test_pre_preds3 = regressor_pre3.predict(X_test_pre)
print('Ridge Regression')
print('MSE- train: %.3f, test: %.3f' % ( mean_squared_error(y_train_pre, y_train_pre_preds3), mean_squared_error(y_test_pre, y_test_pre_preds3)))
print('R^2- train: %.3f, test: %.3f' % ( r2_score(y_train_pre, y_train_pre_preds3),r2_score(y_test_pre, y_test_pre_preds3)))

PRE MONEY VALUTION
Random Forest
MSE- train: 11727.508, test: 39093.378
R^2- train: 0.963, test: 0.809
-----------------------------------------------------
Decision Tree
MSE- train: 0.000, test: 126837.930
R^2c train: 1.000, test: 0.381
-----------------------------------------------------
Ridge Regression
MSE- train: 239609.639, test: 143237.836
R^2- train: 0.251, test: 0.301


Because the value is too small for Ridge Regression, we will drop it for Post Valuation 

### Postmoney

In [15]:
#Assign the values to X and Y 
X_post = Devops[["Deal Size", "% Acquired", "Raised to Date","VC Round",
                                 "Price per Share","Series","Deal Type", "Total Invested Equity",
                                  "Financing Status", "# Investors" , "Current Employees", "HQ Location",
                                  "HQ Global Region", "Company Country", "Year Founded"]]
y_post= Devops[["Post Valuation"]]

In [16]:
#Transform dataset into workable format
encode=LabelEncoder()
X_post = X_post.apply(encode.fit_transform)


#Splitting the training set and test set 
random.seed(112)
X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size =0.25, random_state=42)

In [17]:
#Building the regressor and fit the dataset in
#Random Forest
regressor_post1 = RandomForestRegressor(n_estimators=100, criterion='mse', random_state=42, n_jobs=-1)
regressor_post1.fit(X_train_post, y_train_post.squeeze())

#Decision Tree
regressor_post2 = DecisionTreeRegressor(random_state= 0)
regressor_post2.fit(X_train_post, y_train_post)

DecisionTreeRegressor(random_state=0)

In [18]:
#Model accuracties and validation analysis
print('POST VALUATION')


#Random forest
y_train_post_preds1 = regressor_post1.predict(X_train_post)
y_test_post_preds1 = regressor_post1.predict(X_test_post)
print('Random Forest')
print('MSE- train: %.3f, test: %.3f' % (mean_squared_error(y_train_post, y_train_post_preds1), mean_squared_error(y_test_post, y_test_post_preds1)))
print('R^2- train: %.3f, test: %.3f' % (r2_score(y_train_post, y_train_post_preds1),r2_score(y_test_post, y_test_post_preds1)))
print('-----------------------------------------------------')

#Decision Tree
y_train_post_preds2 = regressor_post2.predict(X_train_post)
y_test_post_preds2 = regressor_post2.predict(X_test_post)
print('Decision Tree')
print('MSE- train: %.3f, test: %.3f' % ( mean_squared_error(y_train_post, y_train_post_preds2), mean_squared_error(y_test_post, y_test_post_preds2)))
print('R^2- train: %.3f, test: %.3f' % ( r2_score(y_train_post, y_train_post_preds2),r2_score(y_test_post, y_test_post_preds2)))


POST VALUATION
Random Forest
MSE- train: 12950.033, test: 42631.631
R^2- train: 0.965, test: 0.820
-----------------------------------------------------
Decision Tree
MSE- train: 0.000, test: 73829.949
R^2- train: 1.000, test: 0.688


# Improving the models

### K-fold cross validation

In [19]:
#Cross Validation for Prevalue's Random Forest
from sklearn.model_selection import cross_val_score
R2_pre = cross_val_score(estimator = regressor_pre1, X = X_train_pre, y = y_train_pre, cv = 10, scoring = 'r2' )
                            #estimator: estimator object implementing 'fit'. The object that u use to fit the model. we called 'classifier' above 
                            #cv:the number of folds u want to split teh training set into. Most common is 10
                            #We can check the accurracies here if we want.
#Now we take the mean and std of the accuracies then we can know how well the model perform 
a1= R2_pre.mean()
a2= R2_pre.std()
print('R^2 of Prevalue Random Forest folds are: ',R2_pre)
print('Mean of the RD R^2 is: ',round(a1,3))
print('STD of the RD R^2 is: ',round(a2,3))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


R^2 of Prevalue Random Forest folds are:  [ 0.72657822  0.23103399  0.92449948  0.96040389 -0.1532754   0.23168322
  0.63462776  0.84304347  0.67177795 -0.27132277]
Mean of the RD R^2 is:  0.48
STD of the RD R^2 is:  0.421


In [20]:
#Cross Validation for PostValue's Random Forest 
from sklearn.model_selection import cross_val_score
R2_post = cross_val_score(estimator = regressor_post1, X = X_train_post, y = y_train_post, cv = 10, scoring = 'r2' )

a3= R2_post.mean()
a4= R2_post.std()
print('R^2 of PostValue Random Forest folds are: ',R2_post)
print('Mean of the RD R^2 is: ',round(a3,3))
print('STD of the RD R^2 is: ',round(a4,3))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


R^2 of PostValue Random Forest folds are:  [ 0.86371967  0.42666836  0.91390033  0.90356983 -0.24090354  0.3613858
  0.61175546  0.80660106  0.71845119  0.11358891]
Mean of the RD R^2 is:  0.548
STD of the RD R^2 is:  0.363


### Grid Search 

In [27]:
#Appplying the Grid search to find the best model and best parameters for for PreValue's Random Forest
from sklearn.model_selection import GridSearchCV
parameters = [{'bootstrap': [True, False],
                'max_depth': [10, 20, 30, None],
                'max_features': ['auto', 'sqrt'],
                 'n_estimators': [10, 30, 200,300]}]                    
                    #Find the parameters (key identifiers) that we can optimize 
                    #Input a list of dictionaries, then we have key identifiers that weill be tested with the grid serach model and the grid search model will find the best one 
grid_search_pre = GridSearchCV(estimator = regressor_pre1,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)
                    #estimator: our machine learning model
                    #param_gid: 'parameters' are teh thigns we try to compare 
                    #scoring: the metrics, it can be anything - accuracy, precision,recall,etc.Accuracy is the most common one 
                    #cv: the k in kfold cross validation, the true grid will evalute the accuracy on each of fold 
                    #n_jobs=-1: in case we will use this on the big dataset  
grid_search_pre  = grid_search_pre.fit(X_train_pre, y_train_pre)
best_R2_pre = grid_search_pre.best_score_
best_parameters_pre = grid_search_pre.best_params_

print('Best R^2 of Grid Search for PreValue RF: ',best_R2_pre)
print('Best Parameters of Grid Search for PreValue RF: ',best_parameters_pre)

Best R^2 of Grid Search for PreValue RF:  0.7379953329822534
Best Parameters of Grid Search for PreValue RF:  {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 10}


  self.best_estimator_.fit(X, y, **fit_params)


In [28]:
#Appplying the Grid search to find the best model and best parameters for for PostValue's Random Forest
grid_search_post = GridSearchCV(estimator = regressor_post1,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)  
grid_search_post  = grid_search_post.fit(X_train_post, y_train_post)
best_R2_post = grid_search_post.best_score_
best_parameters_post = grid_search_post.best_params_

print('Best R^2 of Grid Search for PostValue RF : ',best_R2_post)
print('Best Parameters of Grid Search for PostValue RF: ',best_parameters_post)

Best R^2 of Grid Search for PostValue RF :  0.7126914562524338
Best Parameters of Grid Search for PostValue RF:  {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 10}


  self.best_estimator_.fit(X, y, **fit_params)


In [26]:
#Applying The best model to the test set 
#PreValuation dataset 
regressor_preOptimized = RandomForestRegressor(n_estimators=10, criterion='mse', random_state=42, n_jobs=-1,
                                               bootstrap= False, max_depth = 20, max_features = 'sqrt')
regressor_preOptimized.fit(X_train_pre, y_train_pre.squeeze())
y_test_preOptimized_preds = regressor_preOptimized.predict(X_test_pre)
print('Optimized Random Forest for PreValuation')
print('MSE- test: %.3f' % (mean_squared_error(y_test_pre, y_test_preOptimized_preds)))
print('R^2- test: %.3f' % (r2_score(y_test_pre, y_test_preOptimized_preds)))
print('-----------------------------------------------------')

#PostValuation dataset 
regressor_postOptimized = RandomForestRegressor(n_estimators=10, criterion='mse', random_state=42, n_jobs=-1,
                                                bootstrap= False, max_depth = None, max_features = 'sqrt')
regressor_postOptimized.fit(X_train_post, y_train_post.squeeze())
y_test_postOptimized_preds = regressor_postOptimized.predict(X_test_post)
print('Optimized Random Forest for PostValuation')
print('MSE- test: %.3f' % (mean_squared_error(y_test_post, y_test_postOptimized_preds)))
print('R^2- test: %.3f' % (r2_score(y_test_post, y_test_postOptimized_preds)))


Optimized Random Forest for PreValuation
MSE- test: 60472.714
R^2- test: 0.705
-----------------------------------------------------
Optimized Random Forest for PostValuation
MSE- test: 52246.255
R^2- test: 0.779
