# Modeling

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score

import main
from model import model_data

## Read in the dataset

In [2]:
X_train, y_train, X_validate, y_validate, X_test, y_test = model_data()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


## Transform text data using TF-IDF

## Split the data

In [3]:
print(X_train.shape)
print(y_train.shape)
print(X_validate.shape)
print(y_validate.shape)
print(X_test.shape)
print(y_test.shape)

(353, 25892)
(353,)
(118, 25892)
(118,)
(118, 25892)
(118,)


## Training Set

### Cross Validation

In [4]:
logit_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

svm_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

boost_params = {'learning_rate': [0.0001, 0.001, 0.01, 0.1]}

### Logistic Regression

In [5]:
logit_grid_search = GridSearchCV(LogisticRegression(), logit_params, cv=5)
logit_grid_search.fit(X_train, y_train)
logit_grid_search.score(X_train, y_train)

0.9915014164305949

In [6]:
logit_grid_search.best_estimator_

LogisticRegression(C=10)

### Support Vector Classifier

In [7]:
svm_grid_search = GridSearchCV(SVC(), svm_params, cv=5)
svm_grid_search.fit(X_train, y_train)
svm_grid_search.score(X_train, y_train)

0.9943342776203966

In [8]:
svm_grid_search.best_estimator_

SVC(C=10, gamma=0.1)

### Naive Bayes

In [9]:
gaus = GaussianNB()
gaus.fit(X_train, y_train)
gaus.score(X_train, y_train)

0.9943342776203966

### Ridge Classifier

In [10]:
clf = RidgeClassifierCV()
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.9971671388101983

### Gradient Boost

In [11]:
boost_grid_search = GridSearchCV(GradientBoostingClassifier(), boost_params, cv=5)
boost_grid_search.fit(X_train, y_train)
boost_grid_search.score(X_train, y_train)

0.9971671388101983

In [13]:
boost_grid_search.best_estimator_

GradientBoostingClassifier()

### Random Forest

In [14]:
tree = RandomForestClassifier()
tree.fit(X_train, y_train)
tree.score(X_train, y_train)

1.0

## Validation Set

The best 3 models from the training set.

1. Random Forest
1. Ridge Classifier
1. Gradient Boost

### Random Forest

In [16]:
tree.score(X_validate, y_validate)

0.7457627118644068

### Ridge Classifier

In [17]:
clf.score(X_validate, y_validate)

0.7372881355932204

### Gradient Boost

In [18]:
boost_grid_search.score(X_validate, y_validate)

0.7796610169491526

## Test Set
1. Gradient Boost

In [20]:
boost_grid_search.score(X_test, y_test)

0.7796610169491526

# Evaluate

In [21]:
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

ml = boost_grid_search.fit(X_train, y_train)

train['predicted'] = ml.predict(X_train)
validate['predicted'] = ml.predict(X_validate)
test['predicted'] = ml.predict(X_test)

In [22]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 99.72%
---
Confusion Matrix
actual      C++  Java  JavaScript  Python
predicted                                
C++          90     0           0       1
Java          0    87           0       0
JavaScript    0     0          91       0
Python        0     0           0      84
---
              precision    recall  f1-score   support

         C++       0.99      1.00      0.99        90
        Java       1.00      1.00      1.00        87
  JavaScript       1.00      1.00      1.00        91
      Python       1.00      0.99      0.99        85

    accuracy                           1.00       353
   macro avg       1.00      1.00      1.00       353
weighted avg       1.00      1.00      1.00       353



In [23]:
print('Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate.predicted, validate.actual))
print('---')
print(classification_report(validate.actual, validate.predicted))

Accuracy: 77.12%
---
Confusion Matrix
actual      C++  Java  JavaScript  Python
predicted                                
C++          22     4           4       3
Java          4    21           2       1
JavaScript    3     3          24       0
Python        1     1           1      24
---
              precision    recall  f1-score   support

         C++       0.67      0.73      0.70        30
        Java       0.75      0.72      0.74        29
  JavaScript       0.80      0.77      0.79        31
      Python       0.89      0.86      0.87        28

    accuracy                           0.77       118
   macro avg       0.78      0.77      0.77       118
weighted avg       0.77      0.77      0.77       118



In [24]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 76.27%
---
Confusion Matrix
actual      C++  Java  JavaScript  Python
predicted                                
C++          22     3           3       1
Java          2    21           0       1
JavaScript    3     4          25       4
Python        3     1           3      22
---
              precision    recall  f1-score   support

         C++       0.76      0.73      0.75        30
        Java       0.88      0.72      0.79        29
  JavaScript       0.69      0.81      0.75        31
      Python       0.76      0.79      0.77        28

    accuracy                           0.76       118
   macro avg       0.77      0.76      0.76       118
weighted avg       0.77      0.76      0.76       118

