In [1]:
import numpy as np
import pandas as pd

import wrangle as w

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, \
    confusion_matrix,\
    classification_report

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the repository/README DataFrame through the project's `wrangle` module.
# The preview in the next cell shows the columns repo, language, readme_contents
# and cleaned_readme_contents.
readmes = w.post_explore_wrangle_github_repositories()

In [3]:
readmes.head(3)

Unnamed: 0,repo,language,readme_contents,cleaned_readme_contents
0,huggingface/transformers,Python,<!---\nCopyright 2020 The HuggingFace Team. Al...,copyright 2020 huggingface team right reserved...
1,apachecn/ailearning,Python,"<p align=""center"">\n <a href=""https://www.a...",p aligncenter hrefhttpswwwapachecnorg img widt...
2,google-research/bert,Python,"# BERT\n\n**\*\*\*\*\* New March 11th, 2020: S...",bert new march 11th 2020 smaller bert model re...


In [4]:
# remove null rows (adding this to prep stage)
# readmes = readmes[readmes.cleaned_readme_contents.isna() == False]

## Split data

In [5]:
# Three-way split via the project helper; the shapes shown in the next cell are
# 349 / 100 / 50 rows out of 499.
# NOTE(review): seeding and stratification are decided inside w.train_split,
# which is not visible here - confirm the split is reproducible.
train, validate, test = w.train_split(readmes)

In [29]:
readmes.shape, train.shape, validate.shape, test.shape

((499, 4), (349, 4), (100, 4), (50, 4))

In [7]:
# Features are the cleaned README text; the target is the repository language.
X_train, y_train = train['cleaned_readme_contents'], train['language']
X_validate, y_validate = validate['cleaned_readme_contents'], validate['language']
X_test, y_test = test['cleaned_readme_contents'], test['language']

## Baseline

In [8]:
train.language.value_counts(normalize=True)

Python    0.598854
HTML      0.401146
Name: language, dtype: float64

In [9]:
# Share of the most frequent language. value_counts() sorts descending, so the
# first row *by position* is the majority class; use .iloc because plain [0] on
# a string-labelled Series relies on a deprecated positional fallback.
train.language.value_counts(normalize=True).iloc[0]

0.5988538681948424

In [10]:
baseline = 'Python'

In [11]:
# Baseline model: always predict `baseline` (the majority language of the
# training split, set in the previous cell).
accuracy = (train.language == baseline).mean()
# Score that *same* label on validate. Taking validate's own most frequent
# class would overstate the baseline whenever the two splits disagree on
# which language is the majority.
acc_validate = (validate.language == baseline).mean()
# First row of the running comparison table; later cells add one row per model.
metric_df = pd.DataFrame(data=[
    {
        'model' : 'baseline',
        'accuracy' : accuracy,
        'acc-validate' : acc_validate,
        'difference' : acc_validate - accuracy,
    }
])

In [12]:
metric_df

Unnamed: 0,model,accuracy,acc-validate,difference
0,baseline,0.598854,0.6,0.001146


## The baseline is 'Python' with an accuracy of 60%

# Modeling

## CountVectorizer models

In [13]:
# Smoke test: CountVectorizer with default settings feeding a depth-3 tree.
cv = CountVectorizer()
# Learn the vocabulary from the training READMEs and build the count matrix.
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X_bow, y_train)
# Cell output: accuracy on the training data itself (not a held-out score).
tree.score(X_bow, y_train)

0.8681948424068768

In [14]:
# bag_of_words_cv

In [15]:
# TF-IDF counterpart of the count vectorizer, again with default settings.
tfidf = TfidfVectorizer()
# Fit on the training READMEs. `bag_of_words_tfidf` is not referenced again in
# the cells visible in this notebook.
bag_of_words_tfidf = tfidf.fit_transform(X_train)

In [16]:
# Count features over 1- to 3-word n-grams.
cv = CountVectorizer(ngram_range=(1, 3))
# Exploratory only: `bag_of_grams` is reassigned by the following cell and is
# not used by the model loops further down.
bag_of_grams = cv.fit_transform(X_train)

In [17]:
# NOTE(review): the name `tfidf` is bound to a CountVectorizer here, not a
# TfidfVectorizer - either the class or the variable name looks like a slip;
# confirm which was intended.
tfidf = CountVectorizer(ngram_range=(2, 3))
# Count features over 2- and 3-word n-grams; this reassigns the `bag_of_grams`
# built in the previous cell.
bag_of_grams = tfidf.fit_transform(X_train)

In [18]:
# Decision trees on CountVectorizer features:
# n-gram size i in 1..3, max_depth j in 3..7 (15 models).
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i
    # instead of once per depth.
    cv = CountVectorizer(ngram_range=(i, i))
    X_bow = cv.fit_transform(X_train)
    X_val_bow = cv.transform(X_validate)

    for j in range(3, 8):
        # Fixed seed: ties between equally good splits are otherwise broken
        # at random, so scores would drift from run to run.
        tree = DecisionTreeClassifier(max_depth=j, random_state=42)
        tree.fit(X_bow, y_train)
        accuracy = tree.score(X_bow, y_train)
        acc_validate = tree.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'decistion_tree-cv_{i}gram_{j}depth',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                'difference' : acc_validate - accuracy,
            }
        )

# DataFrame.append was removed in pandas 2.0; add all rows with a single concat.
# Note: re-running this cell adds the same rows again.
metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [19]:
metric_df

Unnamed: 0,model,accuracy,acc-validate,difference
0,baseline,0.598854,0.6,0.001146
1,decistion_tree-cv_1gram_3depth,0.868195,0.9,0.031805
2,decistion_tree-cv_1gram_4depth,0.882521,0.9,0.017479
3,decistion_tree-cv_1gram_5depth,0.91404,0.88,-0.03404
4,decistion_tree-cv_1gram_6depth,0.936963,0.88,-0.056963
5,decistion_tree-cv_1gram_7depth,0.954155,0.9,-0.054155
6,decistion_tree-cv_2gram_3depth,0.805158,0.74,-0.065158
7,decistion_tree-cv_2gram_4depth,0.833811,0.76,-0.073811
8,decistion_tree-cv_2gram_5depth,0.853868,0.77,-0.083868
9,decistion_tree-cv_2gram_6depth,0.868195,0.78,-0.088195


In [20]:
# Random forests on CountVectorizer features:
# n-gram size i in 1..3, max_depth j in 3..7 (15 models).
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i
    # instead of once per depth.
    cv = CountVectorizer(ngram_range=(i, i))
    X_bow = cv.fit_transform(X_train)
    X_val_bow = cv.transform(X_validate)

    for j in range(3, 8):
        # Fixed seed: bootstrap samples and feature subsets are random, so an
        # unseeded forest gives different scores on every run.
        forest = RandomForestClassifier(max_depth=j, random_state=42)
        forest.fit(X_bow, y_train)
        accuracy = forest.score(X_bow, y_train)
        acc_validate = forest.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'random_forest-cv_{i}gram_{j}depth',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                'difference' : acc_validate - accuracy,
            }
        )

# DataFrame.append was removed in pandas 2.0; add all rows with a single concat.
# Note: re-running this cell adds the same rows again.
metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [21]:
# k-nearest neighbours on CountVectorizer features:
# n-gram size i in 1..3, n_neighbors j in 3..7 (15 models).
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i
    # instead of once per neighbour count.
    cv = CountVectorizer(ngram_range=(i, i))
    X_bow = cv.fit_transform(X_train)
    X_val_bow = cv.transform(X_validate)

    for j in range(3, 8):
        model = KNeighborsClassifier(n_neighbors=j)
        model.fit(X_bow, y_train)
        accuracy = model.score(X_bow, y_train)
        acc_validate = model.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'k_nearest-cv_{i}gram_{j}neighbors',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                'difference' : acc_validate - accuracy,
            }
        )

# DataFrame.append was removed in pandas 2.0; add all rows with a single concat.
# Note: re-running this cell adds the same rows again.
metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [22]:
# Logistic regression on CountVectorizer features, one model per n-gram size 1..3.
new_rows = []
for i in range(1, 4):
    cv = CountVectorizer(ngram_range=(i, i))
    X_bow = cv.fit_transform(X_train)
    X_val_bow = cv.transform(X_validate)
    model = LogisticRegression()
    model.fit(X_bow, y_train)
    accuracy = model.score(X_bow, y_train)
    acc_validate = model.score(X_val_bow, y_validate)

    new_rows.append(
        {
            'model' : f'logistic_regress-cv_{i}gram',
            'accuracy' : accuracy,
            'acc-validate' : acc_validate,
            'difference' : acc_validate - accuracy,
        }
    )

# DataFrame.append was removed in pandas 2.0; add all rows with a single concat.
# Note: re-running this cell adds the same rows again.
metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

## TfidfVectorizer models

In [23]:
# Decision trees on TF-IDF features:
# n-gram size i in 1..3, max_depth j in 3..7 (15 models).
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i
    # instead of once per depth.
    tfidf = TfidfVectorizer(ngram_range=(i, i))
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)

    for j in range(3, 8):
        # Fixed seed: ties between equally good splits are otherwise broken
        # at random, so scores would drift from run to run.
        tree = DecisionTreeClassifier(max_depth=j, random_state=42)
        tree.fit(X_bow, y_train)
        accuracy = tree.score(X_bow, y_train)
        acc_validate = tree.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'decistion_tree-tfidf_{i}gram_{j}depth',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                'difference' : acc_validate - accuracy,
            }
        )

# DataFrame.append was removed in pandas 2.0; add all rows with a single concat.
# Note: re-running this cell adds the same rows again.
metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [24]:
# Random forests on TF-IDF features:
# n-gram size i in 1..3, max_depth j in 3..7 (15 models).
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i
    # instead of once per depth.
    tfidf = TfidfVectorizer(ngram_range=(i, i))
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)

    for j in range(3, 8):
        # Fixed seed: bootstrap samples and feature subsets are random, so an
        # unseeded forest gives different scores on every run.
        forest = RandomForestClassifier(max_depth=j, random_state=42)
        forest.fit(X_bow, y_train)
        accuracy = forest.score(X_bow, y_train)
        acc_validate = forest.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'random_forest-tfidf_{i}gram_{j}depth',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                'difference' : acc_validate - accuracy,
            }
        )

# DataFrame.append was removed in pandas 2.0; add all rows with a single concat.
# Note: re-running this cell adds the same rows again.
metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [25]:
# k-nearest neighbours on TF-IDF features:
# n-gram size i in 1..3, n_neighbors j in 3..7 (15 models).
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i
    # instead of once per neighbour count.
    tfidf = TfidfVectorizer(ngram_range=(i, i))
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)

    for j in range(3, 8):
        model = KNeighborsClassifier(n_neighbors=j)
        model.fit(X_bow, y_train)
        accuracy = model.score(X_bow, y_train)
        acc_validate = model.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'k_nearest-tfidf_{i}gram_{j}neighbors',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                'difference' : acc_validate - accuracy,
            }
        )

# DataFrame.append was removed in pandas 2.0; add all rows with a single concat.
# Note: re-running this cell adds the same rows again.
metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [26]:
# Logistic regression on TF-IDF features, one model per n-gram size 1..3.
new_rows = []
for i in range(1, 4):
    tfidf = TfidfVectorizer(ngram_range=(i, i))
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)
    model = LogisticRegression()
    model.fit(X_bow, y_train)
    accuracy = model.score(X_bow, y_train)
    acc_validate = model.score(X_val_bow, y_validate)

    new_rows.append(
        {
            'model' : f'logistic_regress-tfidf_{i}gram',
            'accuracy' : accuracy,
            'acc-validate' : acc_validate,
            'difference' : acc_validate - accuracy,
        }
    )

# DataFrame.append was removed in pandas 2.0; add all rows with a single concat.
# Note: re-running this cell adds the same rows again.
metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [27]:
metric_df

Unnamed: 0,model,accuracy,acc-validate,difference
0,baseline,0.598854,0.60,0.001146
1,decistion_tree-cv_1gram_3depth,0.868195,0.90,0.031805
2,decistion_tree-cv_1gram_4depth,0.882521,0.90,0.017479
3,decistion_tree-cv_1gram_5depth,0.914040,0.88,-0.034040
4,decistion_tree-cv_1gram_6depth,0.936963,0.88,-0.056963
...,...,...,...,...
92,k_nearest-tfidf_3gram_6neighbors,0.401146,0.40,-0.001146
93,k_nearest-tfidf_3gram_7neighbors,0.401146,0.40,-0.001146
94,logistic_regress-tfidf_1gram,0.954155,0.79,-0.164155
95,logistic_regress-tfidf_2gram,0.971347,0.60,-0.371347


In [28]:
metric_df.sort_values('acc-validate', ascending=False).head(15)

Unnamed: 0,model,accuracy,acc-validate,difference
50,decistion_tree-tfidf_1gram_4depth,0.899713,0.91,0.010287
52,decistion_tree-tfidf_1gram_6depth,0.939828,0.9,-0.039828
2,decistion_tree-cv_1gram_4depth,0.882521,0.9,0.017479
5,decistion_tree-cv_1gram_7depth,0.954155,0.9,-0.054155
53,decistion_tree-tfidf_1gram_7depth,0.95702,0.9,-0.05702
49,decistion_tree-tfidf_1gram_3depth,0.868195,0.9,0.031805
1,decistion_tree-cv_1gram_3depth,0.868195,0.9,0.031805
51,decistion_tree-tfidf_1gram_5depth,0.919771,0.88,-0.039771
3,decistion_tree-cv_1gram_5depth,0.91404,0.88,-0.03404
4,decistion_tree-cv_1gram_6depth,0.936963,0.88,-0.056963


### Ran 96 models

### decistion_tree-tfidf_1gram_4depth is our best model with 
- training-accuracy: 0.899713	
- validation-accuracy: 0.91	
- accuracy difference: 0.010287

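Top three models by validation accuracy (train accuracy / validation accuracy / difference):
- decistion_tree-tfidf_1gram_4depth: 0.899713 / 0.91 / 0.010287
- decistion_tree-tfidf_1gram_6depth: 0.939828 / 0.90 / -0.039828
- decistion_tree-cv_1gram_4depth: 0.882521 / 0.90 / 0.017479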

In [57]:
def get_model_tree_tfidf_1gram_4depth(X_train, X_validate, y_train, y_validate,
                                      random_state=42):
    """Fit the unigram TF-IDF / depth-4 decision tree and print its scores.

    Parameters
    ----------
    X_train, X_validate
        Raw text documents. The vectorizer is fitted on ``X_train`` only and
        then applied to ``X_validate``.
    y_train, y_validate
        Labels matching the two document sets.
    random_state : int or None, default 42
        Seed handed to the tree so repeated runs give the same model. Pass
        ``None`` for the previous unseeded behaviour.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``. The vectorizer fitted here is not
    returned, so callers have to rebuild matching features before scoring
    other data.
    """
    tfidf = TfidfVectorizer(ngram_range=(1, 1))
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)

    tree = DecisionTreeClassifier(max_depth=4, random_state=random_state)
    tree.fit(X_bow, y_train)
    accuracy = tree.score(X_bow, y_train)
    acc_validate = tree.score(X_val_bow, y_validate)

    print('test model : Decistion_Tree : TfidfVectorizer : 1gram : 4_max_depth')
    print(f'train accuracy : {accuracy}')
    print(f'validation accuracy : {acc_validate}')
    print(f'difference : {acc_validate - accuracy}')

    return tree

In [58]:
def get_model_tree_tfidf_1gram_6depth(X_train, X_validate, y_train, y_validate,
                                      random_state=42):
    """Fit the unigram TF-IDF / depth-6 decision tree and print its scores.

    Parameters
    ----------
    X_train, X_validate
        Raw text documents. The vectorizer is fitted on ``X_train`` only and
        then applied to ``X_validate``.
    y_train, y_validate
        Labels matching the two document sets.
    random_state : int or None, default 42
        Seed handed to the tree so repeated runs give the same model. Pass
        ``None`` for the previous unseeded behaviour.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``. The vectorizer fitted here is not
    returned, so callers have to rebuild matching features before scoring
    other data.
    """
    tfidf = TfidfVectorizer(ngram_range=(1, 1))
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)

    tree = DecisionTreeClassifier(max_depth=6, random_state=random_state)
    tree.fit(X_bow, y_train)
    accuracy = tree.score(X_bow, y_train)
    acc_validate = tree.score(X_val_bow, y_validate)

    print('test model : Decistion_Tree : TfidfVectorizer : 1gram : 6_max_depth')
    print(f'train accuracy : {accuracy}')
    print(f'validation accuracy : {acc_validate}')
    print(f'difference : {acc_validate - accuracy}')

    return tree

In [59]:
def get_model__tree_cv_1gram_4depth(X_train, X_validate, y_train, y_validate,
                                    random_state=42):
    """Fit the unigram CountVectorizer / depth-4 decision tree and print its scores.

    Parameters
    ----------
    X_train, X_validate
        Raw text documents. The vectorizer is fitted on ``X_train`` only and
        then applied to ``X_validate``.
    y_train, y_validate
        Labels matching the two document sets.
    random_state : int or None, default 42
        Seed handed to the tree so repeated runs give the same model. Pass
        ``None`` for the previous unseeded behaviour.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``. The vectorizer fitted here is not
    returned, so callers have to rebuild matching count features before
    scoring other data.
    """
    cv = CountVectorizer(ngram_range=(1, 1))
    X_bow = cv.fit_transform(X_train)
    X_val_bow = cv.transform(X_validate)

    tree = DecisionTreeClassifier(max_depth=4, random_state=random_state)
    tree.fit(X_bow, y_train)
    accuracy = tree.score(X_bow, y_train)
    acc_validate = tree.score(X_val_bow, y_validate)

    print('test model : Decistion_Tree : CountVectorizer : 1gram : 4_max_depth')
    print(f'train accuracy : {accuracy}')
    print(f'validation accuracy : {acc_validate}')
    print(f'difference : {acc_validate - accuracy}')

    return tree

In [60]:
dt_t_1_4 = get_model_tree_tfidf_1gram_4depth(
    X_train, X_validate, y_train, y_validate)

test model : Decistion_Tree : TfidfVectorizer : 1gram : 4_max_depth
train accuracy : 0.8997134670487106
validation accuracy : 0.9
difference : 0.0002865329512894421


In [61]:
dt_t_1_6 = get_model_tree_tfidf_1gram_6depth(
    X_train, X_validate, y_train, y_validate)

test model : Decistion_Tree : TfidfVectorizer : 1gram : 6_max_depth
train accuracy : 0.9426934097421203
validation accuracy : 0.89
difference : -0.052693409742120334


In [62]:
dt_c_1_4 = get_model__tree_cv_1gram_4depth(
    X_train, X_validate, y_train, y_validate)

test model : Decistion_Tree : CountVectorizer : 1gram : 4_max_depth
train accuracy : 0.8825214899713467
validation accuracy : 0.91
difference : 0.027478510028653313


## Test dataset

In [66]:
def get_model_test_tfifd_1(X_train, y_train, X_test, y_test, model,
                           vectorizer=None,
                           label='Test_data-decistion_tree-tfidf_1gram_4depth'):
    """Print train and test accuracy of an already fitted ``model``.

    Parameters
    ----------
    X_train, X_test
        Raw text documents for the train and test splits.
    y_train, y_test
        Labels matching the two document sets.
    model
        Fitted estimator exposing ``score(X, y)``. It must have been trained
        on features of the same kind as the ones built here.
    vectorizer : optional
        An already fitted vectorizer to reuse via ``transform``. When omitted,
        a unigram ``TfidfVectorizer`` is fitted on ``X_train``, which only
        matches models trained on unigram TF-IDF features of that same data.
    label : str, optional
        Model name printed in the header line. The default keeps the original
        header; pass another name when scoring a different model.

    Returns
    -------
    None. The scores are only printed.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(ngram_range=(1, 1))
        X_bow = vectorizer.fit_transform(X_train)
    else:
        # Reuse the caller's fitted vocabulary instead of refitting.
        X_bow = vectorizer.transform(X_train)
    X_test_bow = vectorizer.transform(X_test)

    accuracy = model.score(X_bow, y_train)
    acc_test = model.score(X_test_bow, y_test)

    print(f'test model : {label}')
    print(f'train accuracy : {accuracy}')
    print(f'acc-test : {acc_test}')
    print(f'difference : {acc_test - accuracy}')

In [71]:
get_model_test_tfifd_1(X_train, y_train, X_test, y_test, dt_t_1_4)

test model : Test_data-decistion_tree-tfidf_1gram_4depth
train accuracy : 0.8997134670487106
acc-test : 0.76
difference : -0.13971346704871057
