# Modeling

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score

import main
import prepare

## Read in the dataset

In [2]:
df = pd.read_csv('../../data/prepared/clean_readmes.csv')

In [3]:
df.head()

Unnamed: 0,language,readme,lemmatized,clean,words,watchers,stars,forks,commits
0,JavaScript,"FCC League-For-Good\nThis is a free, open-sour...",fcc leagueforgood this is a free opensource we...,fcc leagueforgood free opensource web applicat...,"['fcc', 'leagueforgood', 'free', 'opensource',...",28,161,98,411
1,JavaScript,nba\nNode.js client for nba.com API endpoints\...,nba nodejs client for nbacom api endpoint npm ...,nba nodejs client nbacom api endpoint npm inst...,"['nba', 'nodejs', 'client', 'nbacom', 'api', '...",49,621,150,294
2,JavaScript,SportsLeague: Laravel 5.4 based system for var...,sportsleague laravel 54 based system for vario...,sportsleague laravel 54 based system various s...,"['sportsleague', 'laravel', '54', 'based', 'sy...",6,26,48,9
3,JavaScript,Team Colors\n\nTeam Colors is a reference of H...,team color team color is a reference of hex rg...,team color team color reference hex rgb cmyk p...,"['team', 'color', 'team', 'color', 'reference'...",11,123,58,184
4,JavaScript,vue-sports\n\nA Vue.js project\n\n仿凤凰新闻体育板块+赛事...,vuesports a vuejs project vuex vuex build setu...,vuesports vuejs project vuex vuex build setup ...,"['vuesports', 'vuejs', 'project', 'vuex', 'vue...",5,93,42,16


## Transform text data using TF-IDF

In [4]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df.readme)
X = pd.concat([df[['watchers', 'stars', 'forks', 'commits']], pd.DataFrame(X_tfidf.todense())], axis=1)
y = df.language

In [5]:
repo_stats = X.iloc[:, :4]

## Split the data

In [6]:
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, stratify=y_train_validate, test_size=0.25, random_state=1)

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_validate.shape)
print(y_validate.shape)
print(X_test.shape)
print(y_test.shape)

(353, 21453)
(353,)
(118, 21453)
(118,)
(118, 21453)
(118,)


## Cross Validation

In [8]:
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

In [9]:
grid_search = GridSearchCV(LogisticRegression(), parameters, cv=3)

In [10]:
grid_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

KeyboardInterrupt: 

In [None]:
grid_search.score(X_test, y_test)

In [None]:
grid_search.best_estimator_

In [None]:
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = SVC(C=100, gamma=0.1).fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

# Evaluate

In [None]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

In [None]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))