# Modeling

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score

import main
from prepare import prep_readme_data

## Read in the dataset

In [2]:
df = pd.read_csv('../../data/prepared/clean_readmes.csv')

In [3]:
df.rename(columns={'watchers': 'watchers_num',
                   'stars': 'stars_num',
                   'forks': 'forks_num',
                   'commits': 'commits_num'},
          inplace=True)

## Transform text data using TF-IDF

In [4]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df.words)
X = pd.concat([df[['watchers_num', 'stars_num', 'forks_num', 'commits_num']], pd.DataFrame(X_tfidf.todense(), columns=tfidf.get_feature_names())], axis=1)
y = df.language

In [5]:
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(df.words)
tfidfs

<589x25888 sparse matrix of type '<class 'numpy.float64'>'
	with 89418 stored elements in Compressed Sparse Row format>

## Split the data

In [6]:
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, stratify=y_train_validate, test_size=0.25, random_state=1)

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_validate.shape)
print(y_validate.shape)
print(X_test.shape)
print(y_test.shape)

(353, 25892)
(353,)
(118, 25892)
(118,)
(118, 25892)
(118,)


In [8]:
def scale_numeric_columns(X_train, X_validate, X_test):
    scaler = MinMaxScaler()

    # Concatenate scaled test data onto sparse matrix
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train[['watchers_num', 'stars_num',
                                                                'forks_num', 'commits_num']]))
    X_train_scaled.rename(columns={0: 'watchers_num',
                                   1: 'stars_num',
                                   2: 'forks_num',
                                   3: 'commits_num'},
                         inplace=True)

    X_train_scaled.index = X_train.index
    X_train.drop(columns=['watchers_num', 'stars_num', 'forks_num', 'commits_num'], inplace=True)
    X_train = pd.concat([X_train, X_train_scaled], axis=1)
    

    # Concatenate scaled test data onto sparse matrix
    X_validate_scaled = pd.DataFrame(scaler.transform(X_validate[['watchers_num', 'stars_num',
                                                            'forks_num', 'commits_num']]))
    X_validate_scaled.rename(columns={0: 'watchers_num',
                                   1: 'stars_num',
                                   2: 'forks_num',
                                   3: 'commits_num'},
                         inplace=True)
    
    X_validate_scaled.index = X_validate.index
    X_validate.drop(columns=['watchers_num', 'stars_num', 'forks_num', 'commits_num'], inplace=True)
    X_validate = pd.concat([X_validate, X_validate_scaled], axis=1)
    
    
    
    # Concatenate scaled test data onto sparse matrix
    X_test_scaled = pd.DataFrame(scaler.transform(X_test[['watchers_num', 'stars_num',
                                                          'forks_num', 'commits_num']]))
    
    X_test_scaled.rename(columns={0: 'watchers_num',
                                  1: 'stars_num',
                                  2: 'forks_num',
                                  3: 'commits_num'},
                         inplace=True)

    X_test_scaled.index = X_test.index
    X_test.drop(columns=['watchers_num', 'stars_num', 'forks_num', 'commits_num'], inplace=True)
    X_test = pd.concat([X_test, X_test_scaled], axis=1)
    
    return X_train, X_validate, X_test


In [9]:
X_train, X_validate, X_test = scale_numeric_columns(X_train, X_validate, X_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Cross Validation

In [10]:
logit_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

svm_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

boost_params = {'learning_rate': [0.0001, 0.001, 0.01, 0.1]}

### Logistic Regression

In [11]:
logit_grid_search = GridSearchCV(LogisticRegression(), logit_params, cv=5)

logit_grid_search.fit(X_train, y_train)

logit_grid_search.score(X_validate, y_validate)

0.7372881355932204

In [12]:
logit_grid_search.best_estimator_

LogisticRegression(C=10)

### Support Vector Classifier

In [13]:
# svm_grid_search = GridSearchCV(SVC(), svm_params, cv=5)
# svm_grid_search.fit(X_train, y_train)

# # svm_grid_search.score(X_validate, y_validate)

In [14]:
# svm_grid_search.best_estimator_

### Naive Bayes

In [15]:
gaus = GaussianNB()
gaus.fit(X_train, y_train)
gaus.score(X_validate, y_validate)

0.6440677966101694

### Ridge Classifier

In [16]:
clf = RidgeClassifierCV()
clf.fit(X_train, y_train)
clf.score(X_validate, y_validate)

0.7372881355932204

### Gradient Boost

In [17]:
boost_grid_search = GridSearchCV(GradientBoostingClassifier(), boost_params, cv=5)

In [18]:
# boost_grid_search.fit(X_train, y_train)
# boost_grid_search.score(X_validate, y_validate)

### Random Forest

In [19]:
tree = RandomForestClassifier()
tree.fit(X_train, y_train)
tree.score(X_validate, y_validate)

0.7288135593220338

In [20]:
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

logit = LogisticRegression(C=10).fit(X_train, y_train)

train['predicted'] = logit.predict(X_train)
validate['predicted'] = logit.predict(X_validate)
test['predicted'] = logit.predict(X_test)

# Evaluate

In [21]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 99.15%
---
Confusion Matrix
actual      C++  Java  JavaScript  Python
predicted                                
C++          89     1           0       1
Java          1    86           0       0
JavaScript    0     0          91       0
Python        0     0           0      84
---
              precision    recall  f1-score   support

         C++       0.98      0.99      0.98        90
        Java       0.99      0.99      0.99        87
  JavaScript       1.00      1.00      1.00        91
      Python       1.00      0.99      0.99        85

    accuracy                           0.99       353
   macro avg       0.99      0.99      0.99       353
weighted avg       0.99      0.99      0.99       353



In [22]:
print('Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate.predicted, validate.actual))
print('---')
print(classification_report(validate.actual, validate.predicted))

Accuracy: 73.73%
---
Confusion Matrix
actual      C++  Java  JavaScript  Python
predicted                                
C++          23     6           2       1
Java          1    15           1       2
JavaScript    3     1          26       2
Python        3     7           2      23
---
              precision    recall  f1-score   support

         C++       0.72      0.77      0.74        30
        Java       0.79      0.52      0.62        29
  JavaScript       0.81      0.84      0.83        31
      Python       0.66      0.82      0.73        28

    accuracy                           0.74       118
   macro avg       0.74      0.74      0.73       118
weighted avg       0.75      0.74      0.73       118



In [23]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 72.03%
---
Confusion Matrix
actual      C++  Java  JavaScript  Python
predicted                                
C++          22     7           3       2
Java          3    15           1       0
JavaScript    2     4          23       1
Python        3     3           4      25
---
              precision    recall  f1-score   support

         C++       0.65      0.73      0.69        30
        Java       0.79      0.52      0.62        29
  JavaScript       0.77      0.74      0.75        31
      Python       0.71      0.89      0.79        28

    accuracy                           0.72       118
   macro avg       0.73      0.72      0.72       118
weighted avg       0.73      0.72      0.71       118

