# Modeling

In [66]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score

import nlp_prepare

## Read in the dataset

In [35]:
df = pd.read_csv('cleaned_repos.csv').dropna()

In [36]:
df.language = df.language.str.strip()
df = df.loc[df.language != 'Java']

## Transform text data using TF-IDF

In [37]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.words)
y = df.language

## Split the data

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

## Cross Validation

In [89]:
# pd.DataFrame(LogisticRegression().get_params().keys()).T

In [85]:
# pd.DataFrame(RidgeClassifierCV().get_params().keys()).T

In [86]:
# pd.DataFrame(SVC().get_params().keys()).T

In [87]:
# pd.DataFrame(GradientBoostingClassifier().get_params().keys()).T

In [88]:
# pd.DataFrame(GaussianNB().get_params().keys()).T

In [50]:
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, .01, 1, 100]}

In [75]:
grid_search = GridSearchCV(SVC(), parameters, cv=5)

In [76]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.01, 1, 100]})

In [77]:
grid_search.score(X_test, y_test)

0.7857142857142857

In [78]:
grid_search.best_estimator_

SVC(C=100, gamma=0.01)

In [82]:
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = SVC(C=100, gamma=0.1).fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

# Evaluate

In [83]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 100.00%
---
Confusion Matrix
actual      C++  JavaScript  Python
predicted                          
C++          39           0       0
JavaScript    0          39       0
Python        0           0      31
---
              precision    recall  f1-score   support

         C++       1.00      1.00      1.00        39
  JavaScript       1.00      1.00      1.00        39
      Python       1.00      1.00      1.00        31

    accuracy                           1.00       109
   macro avg       1.00      1.00      1.00       109
weighted avg       1.00      1.00      1.00       109



In [84]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 78.57%
---
Confusion Matrix
actual      C++  JavaScript  Python
predicted                          
C++           9           1       2
JavaScript    1           9       2
Python        0           0       4
---
              precision    recall  f1-score   support

         C++       0.75      0.90      0.82        10
  JavaScript       0.75      0.90      0.82        10
      Python       1.00      0.50      0.67         8

    accuracy                           0.79        28
   macro avg       0.83      0.77      0.77        28
weighted avg       0.82      0.79      0.77        28

