In [1]:
import pandas as pd
import numpy as np

# For model building
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split

# for model validation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('new_companies.csv')

In [3]:
df.drop(['Unnamed: 0'], axis = 1, inplace=True)

In [4]:
df.head()

Unnamed: 0,founded_at,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,status,isClosed
0,-0.113605,-0.879874,-1.306764,-0.791392,0.001246,-0.753281,0.361501,1.733481,0,0
1,-0.799293,0.310334,0.454866,1.970875,0.773914,-0.084546,-0.357688,-0.735943,1,1
2,-0.799293,-2.863553,-3.508802,-0.791392,-0.714844,-1.087648,-1.436472,-0.735943,1,1
3,-0.627871,0.310334,0.014459,-0.791392,-0.526399,-0.084546,-0.357688,-0.735943,1,1
4,-0.285027,-1.276609,-1.747172,-0.791392,-0.375644,-0.084546,0.361501,0.498769,1,1


In [5]:
df.columns

Index(['founded_at', 'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'first_milestone_at', 'last_milestone_at',
       'milestones', 'status', 'isClosed'],
      dtype='object')

#### Status Column

0 - Acquired

1 - Operating

2 - Closed

3 - IPO

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9808 entries, 0 to 9807
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   founded_at          9808 non-null   float64
 1   first_funding_at    9808 non-null   float64
 2   last_funding_at     9808 non-null   float64
 3   funding_rounds      9808 non-null   float64
 4   funding_total_usd   9808 non-null   float64
 5   first_milestone_at  9808 non-null   float64
 6   last_milestone_at   9808 non-null   float64
 7   milestones          9808 non-null   float64
 8   status              9808 non-null   int64  
 9   isClosed            9808 non-null   int64  
dtypes: float64(8), int64(2)
memory usage: 766.4 KB


## 1. QDA

- Method designed to separate two or more classes based on a combination of features in a normal ditribution, where it assumes each feature has its own covariance matrix.

- QDA is more flexible with high variance data.

- QDA uses a classifier with a quadratic decision boundary, where each class is fitted with a Gaussian density.

- This classifier was mainly used to classify between 2 classes only, Operating and Not Operating.

In [7]:
# define predictor and response variables
X = df.drop(['status','isClosed'], axis = 1)
y = df['isClosed']

In [8]:
# splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Fit the QDA model
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train, y_train)

In [10]:
preds = qda_model.predict(X_test)

In [11]:
pred_df = pd.DataFrame({'Original' : y_test, 'Prediction' : preds, 'Error' : y_test - preds})
pred_df 

Unnamed: 0,Original,Prediction,Error
9248,1,1,0
1463,1,0,1
4928,1,1,0
3057,1,1,0
2450,1,1,0
...,...,...,...
6967,1,1,0
1916,1,1,0
3079,1,1,0
5512,0,1,-1


### Model Validation

In [12]:
# Mean Absolute Error
mean_absolute_error(y_test, preds) 

0.16476345840130505

In [13]:
# Cross Validation
accuracies = cross_val_score(qda_model, X_train, y_train, cv=5)
accuracies

array([0.82608696, 0.82121006, 0.84296397, 0.82596873, 0.82256968])

In [14]:
# Confusion Metrix
cm_pred = confusion_matrix(y_test, preds)
print('Confusion Metrix for Prediction Data :\n\n',cm_pred)

Confusion Metrix for Prediction Data :

 [[ 119  261]
 [ 143 1929]]


In [15]:
# classification Report
print('Classification Report for Prediction Data :\n\n',classification_report(y_test, preds))

Classification Report for Prediction Data :

               precision    recall  f1-score   support

           0       0.45      0.31      0.37       380
           1       0.88      0.93      0.91      2072

    accuracy                           0.84      2452
   macro avg       0.67      0.62      0.64      2452
weighted avg       0.81      0.84      0.82      2452



In [16]:
# Accuracy Score
print('Accuracy Score for Prediction Data :',accuracy_score(y_test, preds))

Accuracy Score for Prediction Data : 0.835236541598695


## 2. Random Forest Classifier

- Classification algorithm consisting of a large number of individual decision trees that operate as an ensemble.

- Each tree predicts a decision/class, and the decision with the most votes becomes the final prediction.

- Low correlation between data features helps this ensemble reach better scores.

- This classifier was used to classify between all 4 classes, Operating, IPO, Acquired, and Closed.

In [17]:
X = df.drop(['status', 'isClosed'], axis = 1)
y = df['status']

In [18]:
rf_model = RandomForestRegressor(random_state=1)

In [19]:
# splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [20]:
rf_model.fit(X_train, y_train)

In [21]:
preds = rf_model.predict(X_test)

pred_list = []
for i in preds:
    pred_list.append(round(i))


In [22]:
pred_df = pd.DataFrame({'Original' : y_test, 'Prediction' : pred_list, 'Error' : y_test - pred_list})
pred_df 

Unnamed: 0,Original,Prediction,Error
9248,1,1,0
1463,1,1,0
4928,1,1,0
3057,1,1,0
2450,1,1,0
...,...,...,...
6967,1,1,0
1916,1,1,0
3079,1,1,0
5512,2,2,0


In [23]:
pred_df.Prediction.value_counts()

1    2296
0      89
2      66
3       1
Name: Prediction, dtype: int64

### Model Valuation

In [24]:
#1 Mean Absolute Error
mean_absolute_error(y_test, pred_list)

0.20187601957585644

In [25]:
# Confusion Metrix
cm_pred = confusion_matrix(y_test, pred_list)
print('Confusion Metrix for Prediction Data :\n\n',cm_pred)

Confusion Metrix for Prediction Data :

 [[  29  199    5    0]
 [  53 1949   42    1]
 [   7  128   12    0]
 [   0   20    7    0]]


In [26]:
# classification Report
print('Classification Report for Prediction Data :\n\n',classification_report(y_test, pred_list))

Classification Report for Prediction Data :

               precision    recall  f1-score   support

           0       0.33      0.12      0.18       233
           1       0.85      0.95      0.90      2045
           2       0.18      0.08      0.11       147
           3       0.00      0.00      0.00        27

    accuracy                           0.81      2452
   macro avg       0.34      0.29      0.30      2452
weighted avg       0.75      0.81      0.77      2452



In [27]:
# Accuracy Score
print('Accuracy Score for Prediction Data :',accuracy_score(y_test, pred_list))

Accuracy Score for Prediction Data : 0.8115823817292006


## 3. Predictor Function

User defined function that combines qda_model results and rf_model results to give the final prediction. 

In [28]:
# for dataframe

def Predictor_df(to_predict, model_qda, model_rf):

    preds = model_qda.predict(to_predict)
    pred_list = []
    for i in range(0,len(preds)):
        # 0 : operating, 1 : closed
        if preds[i] == 1:
            out = 1
            pred_list.append(out)
        else:
            pred = model_rf.predict(to_predict[i:i+1])
            # 0 : acquired, 1 : operating, 2 : closed, 3 : ipo
            if pred == 0:
                out = 0
                    
            elif pred == 1:
                out = 1
                    
            elif pred == 2:
                out = 2
                    
            else:
                out = 3
            pred_list.append(out)       
        

    return pred_list

In [29]:
pred_list = Predictor_df(X_test, qda_model, rf_model)

In [30]:
len(pred_list)

2452

In [31]:
pred_df = pd.DataFrame({'Original' : y_test, 'Prediction' : pred_list, 'Error' : y_test - pred_list})
pred_df 

Unnamed: 0,Original,Prediction,Error
9248,1,1,0
1463,1,3,-2
4928,1,1,0
3057,1,1,0
2450,1,1,0
...,...,...,...
6967,1,1,0
1916,1,1,0
3079,1,1,0
5512,2,1,1


In [32]:
pred_df.Prediction.value_counts()

1    2194
3     258
Name: Prediction, dtype: int64

In [33]:
#1 Mean Absolute Error
mean_absolute_error(y_test, pred_list)

0.34991843393148453

In [34]:
# Confusion Metrix
cm_pred = confusion_matrix(y_test, pred_list)
print('Confusion Metrix for Prediction Data :\n\n',cm_pred)

Confusion Metrix for Prediction Data :

 [[   0  140    0   93]
 [   0 1916    0  129]
 [   0  121    0   26]
 [   0   17    0   10]]


In [35]:
# classification Report
print('Classification Report for Prediction Data :\n\n',classification_report(y_test, pred_list))

Classification Report for Prediction Data :

               precision    recall  f1-score   support

           0       0.00      0.00      0.00       233
           1       0.87      0.94      0.90      2045
           2       0.00      0.00      0.00       147
           3       0.04      0.37      0.07        27

    accuracy                           0.79      2452
   macro avg       0.23      0.33      0.24      2452
weighted avg       0.73      0.79      0.75      2452



In [36]:
# Accuracy Score
print('Accuracy Score for Prediction Data :',accuracy_score(y_test, pred_list))

Accuracy Score for Prediction Data : 0.7854812398042414


In [37]:
# for single entries
def Predictor(to_predict, model_qda, model_rf):
    #size = len(to_predict_list)
    #to_predict = np.array(to_predict_list).reshape(1, size)

    preds = model_qda.predict(to_predict)
    # 0 : operating, 1 : closed
    if preds == 1:
        out = 'Operating'
        print(1)
    else:
        pred = np.ndarray.tolist(model_rf.predict(to_predict))
        pred = round(pred[0])
        # result = str(preds[0])
        # 0 : acquired, 1 : operating, 2 : closed, 3 : ipo
        if pred == 0:
            out = 'Acquired'
        elif pred == 1:
            out = 'Operating'
        elif pred == 2:
            out = 'Closed'
        else:
            out = 'IPO'

    return out

In [38]:
y_test[-1:]

5197    1
Name: status, dtype: int64

In [39]:
X_test[-1:]

Unnamed: 0,founded_at,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones
5197,0.743504,1.103806,0.895274,-0.791392,-0.747106,0.918555,0.721096,-0.735943
