In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import eli5
import pickle

sns.set()
warnings.filterwarnings('ignore')

In [2]:
from sklearn.feature_selection import mutual_info_regression, SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_absolute_percentage_error, mean_squared_error, roc_auc_score, log_loss, precision_recall_fscore_support, mean_absolute_error, plot_roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from imblearn.over_sampling import RandomOverSampler, SMOTE
from eli5.sklearn import PermutationImportance
from pprint import pprint


---

In [3]:
pd.set_option('display.max_columns', None)

---

**loading data + X, y split**

In [4]:
path = 'D:\Internship Database\companies1.csv'
df = pd.read_csv(path)

X = df.copy()
y = df[['status', 'isClosed', 'active_days']]
yStatus = X.pop('status')
yClosed = X.pop('isClosed')
yActive = X.pop('active_days')
X.columns

Index(['Unnamed: 0', 'founded_at', 'funding_rounds', 'funding_total_usd',
       'milestones', 'relationships', 'lat', 'lng', 'category_code_biotech',
       'category_code_consulting', 'category_code_ecommerce',
       'category_code_education', 'category_code_enterprise',
       'category_code_games_video', 'category_code_hardware',
       'category_code_mobile', 'category_code_network_hosting',
       'category_code_other', 'category_code_public_relations',
       'category_code_search', 'category_code_software', 'category_code_web',
       'country_code_BRA', 'country_code_CAN', 'country_code_DEU',
       'country_code_ESP', 'country_code_FRA', 'country_code_GBR',
       'country_code_IND', 'country_code_IRL', 'country_code_ISR',
       'country_code_NLD', 'country_code_USA', 'country_code_other'],
      dtype='object')

In [5]:
X = X[[
    'founded_at', 'funding_rounds', 'funding_total_usd', 'milestones', 'relationships'
]]
X.columns

Index(['founded_at', 'funding_rounds', 'funding_total_usd', 'milestones',
       'relationships'],
      dtype='object')

In [6]:
closed = yClosed.value_counts(); print(closed)
status = yStatus.value_counts(); print(status)

1    39273
0     3965
Name: isClosed, dtype: int64
3    38864
0     2782
1     1183
2      409
Name: status, dtype: int64


---

**train, test split**

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

yStatus_train = y_train.iloc[:,0]
yClosed_train = y_train.iloc[:,1]
yActive_train = y_train.iloc[:,2]

yStatus_test = y_test.iloc[:,0]
yClosed_test = y_test.iloc[:,1]
yActive_test = y_test.iloc[:,2]

---

## data preprocessing

In [8]:
over = RandomOverSampler(random_state=0)
smote = SMOTE()

stdscaler = StandardScaler()
scaler = MinMaxScaler()

pca = PCA(n_components=2)

---

## classification

**ensemble learning**

In [9]:
qda = QuadraticDiscriminantAnalysis()

rf = RandomForestClassifier(random_state=0)

In [10]:
estimators = [('QDA', qda), ('RandomForest', rf)]
final_estimator = GradientBoostingClassifier()
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [11]:
model_ensemble = Pipeline([
    ('stdscaler', stdscaler),
    ('pca', pca),
    ('classifier', clf)
])

In [12]:
model_ensemble.fit(X_train, yStatus_train)
pred = model_ensemble.predict(X_test)
print('test accuracy = ', round(accuracy_score(yStatus_test, pred)*100, 2), '%')

test accuracy =  89.34 %


In [13]:
print(classification_report(yStatus_test, pred, digits=3))

              precision    recall  f1-score   support

           0      0.273     0.005     0.011       560
           1      0.029     0.004     0.007       248
           2      0.375     0.120     0.182        75
           3      0.899     0.993     0.944      7765

    accuracy                          0.893      8648
   macro avg      0.394     0.281     0.286      8648
weighted avg      0.829     0.893     0.850      8648



**QDA**

In [14]:
model_qda = Pipeline([
    ('stdscaler', stdscaler),
    ('pca', pca),
    ('classifier', qda)
])

In [15]:
model_qda.fit(X_train, yClosed_train)
pred = model_qda.predict(X_test)
print('test accuracy = ', round(accuracy_score(yClosed_test, pred)*100, 2), '%')

test accuracy =  88.84 %


In [16]:
print(classification_report(yClosed_test, pred, digits=3))

              precision    recall  f1-score   support

           0      0.230     0.083     0.122       808
           1      0.911     0.971     0.940      7840

    accuracy                          0.888      8648
   macro avg      0.571     0.527     0.531      8648
weighted avg      0.848     0.888     0.864      8648



**Random Forest**

In [17]:
model_rf = Pipeline([
    ('stdscaler', stdscaler),
    ('pca', pca),
    ('classifier', rf)
])

In [18]:
model_rf.fit(X_train, yStatus_train)
pred = model_rf.predict(X_test)
print('test accuracy = ', round(accuracy_score(yStatus_test, pred)*100, 2), '%')

test accuracy =  88.6 %


In [19]:
print(classification_report(yClosed_test, pred, digits=3))

              precision    recall  f1-score   support

           0      0.262     0.048     0.082       808
           1      0.837     0.005     0.010      7840
           2      0.000     0.000     0.000         0
           3      0.000     0.000     0.000         0

    accuracy                          0.009      8648
   macro avg      0.275     0.013     0.023      8648
weighted avg      0.783     0.009     0.017      8648



---

## saving model

In [22]:
pickle.dump(model_ensemble, open('D:\Internship Database\ensemble.pkl', 'wb'))
pickle.dump(model_qda, open('D:\Internship Database\qda.pkl', 'wb'))


In [25]:
pickle.dump(model_rf, open('D:\Internship Database\mod_rf.pkl', 'wb'))

In [None]:
# to load the model :
# pickled_model = pickle.load(open('model.pkl', 'rb'))
# pickled_model.predict(X_test)

---