In [2]:
#Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
#Import pre-processed dataframe
ml_df = pd.read_csv('C:/datascience/springboard/projects/Venture Capital/data/Exported Data/ML Dataframe.csv', encoding="ISO-8859-1")

In [4]:
#Show summary information of Venture Capital Dataframe
ml_df.info(verbose=True)

#Set index to company name for reference (not a feature)
ml_df = ml_df.set_index('company_name')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20621 entries, 0 to 20620
Data columns (total 58 columns):
company_name         20621 non-null object
status               20621 non-null object
funding_rounds       20621 non-null float64
invscore_1           20621 non-null float64
invscore_10          20621 non-null float64
invscore_11          20621 non-null float64
invscore_12          20621 non-null float64
invscore_2           20621 non-null float64
invscore_3           20621 non-null float64
invscore_4           20621 non-null float64
invscore_5           20621 non-null float64
invscore_6           20621 non-null float64
invscore_7           20621 non-null float64
invscore_8           20621 non-null float64
invscore_9           20621 non-null float64
invct_1              20621 non-null float64
invct_10             20621 non-null float64
invct_11             20621 non-null float64
invct_12             20621 non-null float64
invct_2              20621 non-null float64
invct_3      

In [5]:
#Before Encoding print value_counts of each label (4)
print("Full Dataset Target Label Counts:", '\n', ml_df['status'].value_counts(), '\n')
print("Full Dataset Target as % of Total:", '\n', round(ml_df['status'].value_counts() / len(ml_df), 2), '\n')

#Convert df to array and perform one-hot encoding for target variable 'status'
tgt_enc = LabelEncoder().fit(ml_df['status'])
tgt_encoded = tgt_enc.transform(ml_df['status'])
ml_df = ml_df.drop(columns='status')

#Print Status Labels and Numbers for reference
print("Target Labels", np.unique(tgt_enc.inverse_transform(tgt_encoded)), '\n' ,"Label Numbers", np.unique(tgt_encoded))

Full Dataset Target Label Counts: 
 operating    15157
acquired      4567
closed         553
ipo            344
Name: status, dtype: int64 

Full Dataset Target as % of Total: 
 operating    0.74
acquired     0.22
closed       0.03
ipo          0.02
Name: status, dtype: float64 

Target Labels ['acquired' 'closed' 'ipo' 'operating'] 
 Label Numbers [0 1 2 3]


Given that 74% of the companies in the dataset are companies still operating, one strategy would be to just predict that every company is operating. This would result in 74% accuracy, therefore one of my criteria is to acheive an accuracy score of greater than 74% to be considered useful. 

In [6]:
#Convert dataframe to array and separate feature data (X) from target data (Y)
X = ml_df.values
Y = tgt_encoded

#Scale feature data to acheieve mean of zero for each feature to account for large variation of values
scaled_X = StandardScaler().fit_transform(X)

#Train, test, split data using 70% of data for training
X_train, X_test, y_train, y_test = train_test_split(scaled_X, Y, test_size=0.3) #Random State?

In [7]:
#Visualize feature importance
features_RandTree = {}

for feature, importance in zip(ml_df.columns, model_RandTree.feature_importances_):
    features_RandTree[feature] = importance

importances_RandTree = pd.DataFrame.from_dict(features_RandTree, orient='index').rename(columns={0: 'Gini-Importance'})
importances_RandTree = importances_RandTree.sort_values(by='Gini-Importance', ascending=True)

fig, ax = plt.subplots(figsize=(15,12))
plt.barh(importances_RandTree.index, importances_RandTree['Gini-Importance'])
plt.title('Feature Importance - Random Forest Classifier', fontsize=17)
ax.set_ylabel('Features', fontsize=16)
ax.set_yticklabels(importances_RandTree.index, fontsize=15)
ax.set_xlabel('Gini-Importance', fontsize=16)
plt.tight_layout()
plt.show()

NameError: name 'model_RandTree' is not defined

In [None]:
#Subset original df using identified highest importance features (10% threshold)
sfm_RandTree = SelectFromModel(model_RandTree, threshold=0.10)
sfm_RandTree.fit(X_train, y_train)

#Print selected features
for idx, feature_list_index in enumerate(sfm_RandTree.get_support(indices=True)):
    i = idx + 1
    print("Selected Feature %s:" % i, ml_df.columns[feature_list_index])

#Transform data based on feature selection
X_important_train = sfm_RandTree.transform(X_train)
X_important_test = sfm_RandTree.transform(X_test)

In [None]:
#Parameter Tuning for n_estimators. Using selected features.
n_estimators = range(1, 202, 5)

train_results = []
test_results = []

for estimator in n_estimators:
    #Training data
    rf = RandomForestClassifier(n_estimators=estimator, n_jobs=-1)
    rf.fit(X_important_train, y_train)
    train_pred = rf.predict(X_important_train)
    train_score = round(accuracy_score(y_train, train_pred),2)
    train_results.append(train_score)

    #Test Data
    test_pred = rf.predict(X_important_test)
    test_score = round(accuracy_score(y_test, test_pred),2)
    test_results.append(test_score)

#Plot Results
train_line = plt.plot(n_estimators, train_results, color='blue', label = 'Train Score')
test_line = plt.plot(n_estimators, test_results, color='red', label = 'Test Score')
plt.legend()
plt.xlabel('Number of Estimators')
plt.ylabel('Accuracy Scores')
plt.show()

In [None]:
#Parameter Tuning for max_depth. Using selected features and n_estimators=15.
rf_depth = range(1, 50, 1)

train_results = []
test_results = []

for depth in rf_depth:
    #Training data
    rf = RandomForestClassifier(max_depth=depth, n_estimators=50, n_jobs=-1)
    rf.fit(X_important_train, y_train)
    train_pred = rf.predict(X_important_train)
    train_score = round(accuracy_score(y_train, train_pred),2)
    train_results.append(train_score)

    #Test Data
    test_pred = rf.predict(X_important_test)
    test_score = round(accuracy_score(y_test, test_pred),2)
    test_results.append(test_score)

#Plot Results
train_line = plt.plot(rf_depth, train_results, color='blue', label = 'Train Score')
test_line = plt.plot(rf_depth, test_results, color='red', label = 'Test Score')
plt.legend()
plt.xlabel('Max Depth of Random Forest')
plt.ylabel('Accuracy Scores')
plt.show()

In [None]:
#Parameter Tuning for min_samples_split and min_samples_leaf. Using selected features and n_estimators=50.
min_sample = np.arange(0.05, 0.50, 0.05)

train_results = []
test_results = []

for sample in min_sample:
    #Training data
    #I used same code to test leafs, just changed min_samples_split to min_samples_leaf (changed max step value to 0.5 as well)
    rf = RandomForestClassifier(n_estimators=50, min_samples_leaf=sample, n_jobs=-1)
    rf.fit(X_important_train, y_train)
    train_pred = rf.predict(X_important_train)
    train_score = round(accuracy_score(y_train, train_pred),2)
    train_results.append(train_score)

    #Test Data
    test_pred = rf.predict(X_important_test)
    test_score = round(accuracy_score(y_test, test_pred),2)
    test_results.append(test_score)

#Plot Results
train_line = plt.plot(min_sample, train_results, color='blue', label = 'Train Score')
test_line = plt.plot(min_sample, test_results, color='red', label = 'Test Score')
plt.legend()
plt.xlabel('Minimum Sample')
plt.ylabel('Accuracy Scores')
plt.show()

In [None]:
#Using n_estimators=50 and max_depth=20. I decided not to create parameters for min_samples of min_leaf as some of the classes
#had smaller sample sets and didn't want to set thresholds that would potentially ignore these classes. 
#Instantiate RandomForestClassifier for deeper evaulation. 
model_RandTree = RandomForestClassifier(max_depth=10, n_estimators=50, random_state=0, n_jobs=-1)

#Train and create prediction values for full dataset 
model_RandTree.fit(X_train, y_train)
y_pred_RandTree = model_RandTree.predict(X_test)

#Train and create prediction values for selected dataset
model_RandTree.fit(X_important_train, y_train)
y_pred_RandTree_important = model_RandTree.predict(X_important_test)

#Verify model inputs
model_RandTree

In [None]:
#Graph confusion matrix for full dataset and dataset using top features
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15,15))
mat1 = confusion_matrix(y_test, y_pred_RandTree)
mat2 = confusion_matrix(y_test, y_pred_RandTree_important)

sns.heatmap(mat1.T, square=True, annot=True, fmt='d', cbar=False, ax=ax1)
ax1.set_title('Confusion Matrix - Full Features (accuracy = %s percent)' % 
              round(accuracy_score(y_test, y_pred_RandTree),2), fontsize=15)
ax1.set_xlabel('True Label', fontsize=13)
ax1.set_ylabel('Predicted Label', fontsize=13)
bottom, top = ax1.get_ylim()
ax1.set_ylim(bottom + 0.5, top - 0.5)

sns.heatmap(mat2.T, square=True, annot=True, fmt='d', cbar=False, ax=ax2)
ax2.set_title('Confusion Matrix - Important Features (accuracy = %s percent)' % 
              round(accuracy_score(y_test, y_pred_RandTree_important),2), fontsize=15)
ax2.set_xlabel('True Label', fontsize=13)
ax2.set_ylabel('Predicted Label', fontsize=13)
bottom, top = ax2.get_ylim()
ax2.set_ylim(bottom + 0.5, top- 0.5)

plt.show()

#Print Status Labels and Numbers for reference
print("Target Labels", np.unique(tgt_enc.inverse_transform(tgt_encoded)), '\n' ,"Label Numbers", np.unique(tgt_encoded))

In [None]:
#Classification Report
print("Random Forest - Full Dataset:" + '\n' + classification_report(y_test, y_pred_RandTree, target_names=np.unique(tgt_enc.inverse_transform(tgt_encoded))))
print("Random Forest - Important Dataset:" + '\n' + classification_report(y_test, y_pred_RandTree_important, target_names=np.unique(tgt_enc.inverse_transform(tgt_encoded))))

In [None]:
#Visualize Tree Plot?

## Takeaways

1) The Random Forest Model predicted approximately 62% of the acquired companies correctly (recall). Considering acquisitions comprised 22% of the original dataset, I consider this accuracy to be reasonably good.

2) Predicted results were given multi-class labels, but predictions were mostly binary. We likely don't have enough datapoints on companies that did an Initial Public Offering (IPO) or Closed. The data distinguished between operating and acquired, which was 96% of the original dataset. This would argue that the model would need to predict at least 50% of the acquisitions to be useful (i.e. binary classification).

3) The average weighted F1 score was 84%, which is useful for class imbalanced data such as this. 

4) The results were virtually identical regardless of if you used the three features or the full dataset. 

