In [43]:
import numpy as np
import pandas as pd

import wrangle as w

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, \
    confusion_matrix,\
    classification_report

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the repository README data through the project's wrangle module.
readmes = w.wrangle_github_repositories()

In [3]:
# Preview the first three rows of the loaded frame.
readmes.head(3)

Unnamed: 0,repo,language,readme_contents,cleaned_readme_contents
0,huggingface/transformers,Python,<!---\nCopyright 2020 The HuggingFace Team. Al...,copyright 2020 huggingface team right reserved...
1,apachecn/ailearning,Python,"<p align=""center"">\n <a href=""https://www.a...",p aligncenter hrefhttpswwwapachecnorg img widt...
2,google-research/bert,Python,"# BERT\n\n**\*\*\*\*\* New March 11th, 2020: S...",bert new march 11th 2020 smaller bert model re...


In [7]:
# Keep only rows that actually have cleaned README text
# (this filter is slated to move into the prep stage).
readmes = readmes[readmes.cleaned_readme_contents.notna()]

## Split data

In [8]:
# Split the data into train / validate / test frames with the project's helper
# (see wrangle.train_split for how the split is made).
train, validate, test = w.train_split(readmes)

In [9]:
# Compare the (rows, columns) of the full frame and of each split.
readmes.shape, train.shape, validate.shape, test.shape

((493, 4), (275, 4), (119, 4), (99, 4))

In [13]:
# Features are the cleaned README text; the target is the repository language.
X_train, y_train = train.cleaned_readme_contents, train.language
X_validate, y_validate = validate.cleaned_readme_contents, validate.language
X_test, y_test = test.cleaned_readme_contents, test.language

## Baseline

In [19]:
# Class balance of the target in the training split, as proportions.
train.language.value_counts(normalize=True)

Python    0.610909
HTML      0.389091
Name: language, dtype: float64

In [31]:
# Share of the most frequent language in train, i.e. the accuracy of always
# predicting that class. value_counts() sorts descending, so position 0 is the
# top class. The index holds language names, so use .iloc for an explicit
# positional lookup instead of the deprecated integer-key fallback of [0].
train.language.value_counts(normalize=True).iloc[0]

0.610909090909091

In [20]:
# Baseline prediction: the most common language in the training split,
# derived from the data rather than hard-coded so it follows the dataset.
baseline = train.language.value_counts().idxmax()

In [44]:
# Baseline model: always predict `baseline`, the class chosen on the TRAINING
# split. Its accuracy on any split is the share of rows carrying that label,
# so validate is scored against the same class rather than against whatever
# class happens to be most frequent in validate.
accuracy = (train.language == baseline).mean()
acc_validate = (validate.language == baseline).mean()

# Start the running comparison table; later cells add one row per model.
metric_df = pd.DataFrame(data=[
    {
        'model' : 'baseline',
        'accuracy' : accuracy,
        'acc-validate' : acc_validate,
        'difference' : acc_validate - accuracy,
    }
])

In [45]:
# Show the metrics table (baseline row only at this point).
metric_df

Unnamed: 0,model,accuracy,acc-validate,difference
0,baseline,0.610909,0.605042,-0.005867


## The baseline is 'Python' with an accuracy of 61%

# Modeling

In [14]:
# Bag-of-words features: learn the vocabulary from the training READMEs
# and turn them into a sparse term-count matrix.
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
# Shallow decision tree as a first model; fit() returns the estimator itself.
tree = DecisionTreeClassifier(max_depth=3).fit(X_bow, y_train)
# Training accuracy (shown as the cell output).
tree.score(X_bow, y_train)

0.8836363636363637

In [11]:
# Inspect the sparse term-count matrix built from the training READMEs.
# (The previous name, bag_of_words_cv, is no longer defined anywhere in the
# notebook, so the cell failed on a fresh kernel.)
X_bow

<275x36883 sparse matrix of type '<class 'numpy.int64'>'
	with 85429 stored elements in Compressed Sparse Row format>

In [12]:
# Build a TF-IDF vectorizer with default settings
tfidf = TfidfVectorizer()
# Fit it on the training text and produce the TF-IDF weighted matrix
bag_of_words_tfidf = tfidf.fit_transform(X_train)

In [None]:
# Count features over unigrams, bigrams and trigrams together (ngram_range 1-3).
cv = CountVectorizer(ngram_range=(1, 3))
bag_of_grams = cv.fit_transform(X_train)

In [None]:
# TF-IDF features over bigrams and trigrams (ngram_range 2-3).
# The variable is named tfidf, so build a TfidfVectorizer rather than a second
# CountVectorizer. Note that bag_of_grams is reassigned here and replaces the
# count-based matrix from the previous cell.
tfidf = TfidfVectorizer(ngram_range=(2, 3))
bag_of_grams = tfidf.fit_transform(X_train)

In [46]:
# Decision trees on raw term counts: grid over n-gram size (1-3) and depth (3-7).
# Results are gathered in a list and added to metric_df with one concat, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call. Re-running this cell adds the same rows to metric_df again.
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i.
    # The vocabulary comes from train and is reused unchanged for validate.
    cv = CountVectorizer(ngram_range=(i, i))
    X_bow = cv.fit_transform(X_train)
    X_val_bow = cv.transform(X_validate)
    for j in range(3, 8):
        tree = DecisionTreeClassifier(max_depth=j)
        tree.fit(X_bow, y_train)
        accuracy = tree.score(X_bow, y_train)
        acc_validate = tree.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'decistion_tree-cv_{i}gram_{j}depth',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                # negative values mean the model scores worse on unseen data
                'difference' : acc_validate - accuracy,
            }
        )

metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [47]:
# Show the metrics table after the decision-tree runs.
metric_df

Unnamed: 0,model,accuracy,acc-validate,difference
0,baseline,0.610909,0.605042,-0.005867
1,decistion_tree-cv_1gram_3depth,0.883636,0.789916,-0.09372
2,decistion_tree-cv_1gram_4depth,0.923636,0.764706,-0.15893
3,decistion_tree-cv_1gram_5depth,0.949091,0.747899,-0.201192
4,decistion_tree-cv_1gram_6depth,0.963636,0.756303,-0.207334
5,decistion_tree-cv_1gram_7depth,0.970909,0.731092,-0.239817
6,decistion_tree-cv_2gram_3depth,0.792727,0.697479,-0.095248
7,decistion_tree-cv_2gram_4depth,0.825455,0.689076,-0.136379
8,decistion_tree-cv_2gram_5depth,0.854545,0.697479,-0.157066
9,decistion_tree-cv_2gram_6depth,0.872727,0.689076,-0.183652


In [48]:
# Random forests on raw term counts: grid over n-gram size (1-3) and depth (3-7).
# Results are gathered in a list and added to metric_df with one concat, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call. Re-running this cell adds the same rows to metric_df again.
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i.
    # The vocabulary comes from train and is reused unchanged for validate.
    cv = CountVectorizer(ngram_range=(i, i))
    X_bow = cv.fit_transform(X_train)
    X_val_bow = cv.transform(X_validate)
    for j in range(3, 8):
        # A fixed seed keeps the forest's bootstrap/feature sampling repeatable,
        # so the scores in the table do not change from run to run.
        tree = RandomForestClassifier(max_depth=j, random_state=42)
        tree.fit(X_bow, y_train)
        accuracy = tree.score(X_bow, y_train)
        acc_validate = tree.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'random_forest-cv_{i}gram_{j}depth',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                # negative values mean the model scores worse on unseen data
                'difference' : acc_validate - accuracy,
            }
        )

metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [49]:
# Show the metrics table after the random-forest runs.
metric_df

Unnamed: 0,model,accuracy,acc-validate,difference
0,baseline,0.610909,0.605042,-0.005867
1,decistion_tree-cv_1gram_3depth,0.883636,0.789916,-0.09372
2,decistion_tree-cv_1gram_4depth,0.923636,0.764706,-0.15893
3,decistion_tree-cv_1gram_5depth,0.949091,0.747899,-0.201192
4,decistion_tree-cv_1gram_6depth,0.963636,0.756303,-0.207334
5,decistion_tree-cv_1gram_7depth,0.970909,0.731092,-0.239817
6,decistion_tree-cv_2gram_3depth,0.792727,0.697479,-0.095248
7,decistion_tree-cv_2gram_4depth,0.825455,0.689076,-0.136379
8,decistion_tree-cv_2gram_5depth,0.854545,0.697479,-0.157066
9,decistion_tree-cv_2gram_6depth,0.872727,0.689076,-0.183652


In [50]:
# K-nearest neighbours on raw term counts: grid over n-gram size (1-3) and k (3-7).
# Results are gathered in a list and added to metric_df with one concat, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call. Re-running this cell adds the same rows to metric_df again.
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i.
    # The vocabulary comes from train and is reused unchanged for validate.
    cv = CountVectorizer(ngram_range=(i, i))
    X_bow = cv.fit_transform(X_train)
    X_val_bow = cv.transform(X_validate)
    for j in range(3, 8):
        model = KNeighborsClassifier(n_neighbors=j)
        model.fit(X_bow, y_train)
        accuracy = model.score(X_bow, y_train)
        acc_validate = model.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'k_nearest-cv_{i}gram_{j}neighbors',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                # negative values mean the model scores worse on unseen data
                'difference' : acc_validate - accuracy,
            }
        )

metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [51]:
# Logistic regression on raw term counts, one model per n-gram size (1-3).
# Results are gathered in a list and added to metric_df with one concat, because
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call. Re-running this cell adds the same rows to metric_df again.
new_rows = []
for i in range(1, 4):
    # The vocabulary comes from train and is reused unchanged for validate.
    cv = CountVectorizer(ngram_range=(i, i))
    X_bow = cv.fit_transform(X_train)
    X_val_bow = cv.transform(X_validate)
    model = LogisticRegression()
    model.fit(X_bow, y_train)
    accuracy = model.score(X_bow, y_train)
    acc_validate = model.score(X_val_bow, y_validate)

    new_rows.append(
        {
            'model' : f'logistic_regress-cv_{i}gram',
            'accuracy' : accuracy,
            'acc-validate' : acc_validate,
            # negative values mean the model scores worse on unseen data
            'difference' : acc_validate - accuracy,
        }
    )

metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [52]:
# Show the metrics table after all count-vectorizer models have been added.
metric_df

Unnamed: 0,model,accuracy,acc-validate,difference
0,baseline,0.610909,0.605042,-0.005867
1,decistion_tree-cv_1gram_3depth,0.883636,0.789916,-0.09372
2,decistion_tree-cv_1gram_4depth,0.923636,0.764706,-0.15893
3,decistion_tree-cv_1gram_5depth,0.949091,0.747899,-0.201192
4,decistion_tree-cv_1gram_6depth,0.963636,0.756303,-0.207334
5,decistion_tree-cv_1gram_7depth,0.970909,0.731092,-0.239817
6,decistion_tree-cv_2gram_3depth,0.792727,0.697479,-0.095248
7,decistion_tree-cv_2gram_4depth,0.825455,0.689076,-0.136379
8,decistion_tree-cv_2gram_5depth,0.854545,0.697479,-0.157066
9,decistion_tree-cv_2gram_6depth,0.872727,0.689076,-0.183652


# TfidfVectorizer Models

In [46]:
# Decision trees on TF-IDF features: grid over n-gram size (1-3) and depth (3-7).
# This section is about TF-IDF, so the features come from TfidfVectorizer (the
# cell previously rebuilt the CountVectorizer features) and the row labels carry
# a "tfidf" tag to tell them apart from the count-based rows.
# Results are gathered in a list and added to metric_df with one concat, because
# DataFrame.append was removed in pandas 2.0. Re-running this cell adds the same
# rows to metric_df again.
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i.
    # Vocabulary and IDF weights come from train and are reused for validate.
    tfidf = TfidfVectorizer(ngram_range=(i, i))
    # Names X_bow / X_val_bow are kept from the count section; they hold
    # TF-IDF weighted matrices here.
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)
    for j in range(3, 8):
        tree = DecisionTreeClassifier(max_depth=j)
        tree.fit(X_bow, y_train)
        accuracy = tree.score(X_bow, y_train)
        acc_validate = tree.score(X_val_bow, y_validate)

        new_rows.append(
            {
                # "decistion" spelling matches the existing count-based rows
                'model' : f'decistion_tree-tfidf_{i}gram_{j}depth',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                # negative values mean the model scores worse on unseen data
                'difference' : acc_validate - accuracy,
            }
        )

metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [48]:
# Random forests on TF-IDF features: grid over n-gram size (1-3) and depth (3-7).
# This section is about TF-IDF, so the features come from TfidfVectorizer (the
# cell previously rebuilt the CountVectorizer features and reused the "cv"
# labels, which duplicated the count-based rows).
# Results are gathered in a list and added to metric_df with one concat, because
# DataFrame.append was removed in pandas 2.0. Re-running this cell adds the same
# rows to metric_df again.
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i.
    # Vocabulary and IDF weights come from train and are reused for validate.
    tfidf = TfidfVectorizer(ngram_range=(i, i))
    # Names X_bow / X_val_bow are kept from the count section; they hold
    # TF-IDF weighted matrices here.
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)
    for j in range(3, 8):
        # A fixed seed keeps the forest's bootstrap/feature sampling repeatable,
        # so the scores in the table do not change from run to run.
        tree = RandomForestClassifier(max_depth=j, random_state=42)
        tree.fit(X_bow, y_train)
        accuracy = tree.score(X_bow, y_train)
        acc_validate = tree.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'random_forest-tfidf_{i}gram_{j}depth',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                # negative values mean the model scores worse on unseen data
                'difference' : acc_validate - accuracy,
            }
        )

metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [50]:
# K-nearest neighbours on TF-IDF features: grid over n-gram size (1-3) and k (3-7).
# The cell used to create `tfidf` but then transformed the text with `cv`, a
# CountVectorizer left over from an earlier cell, so every row was really scored
# on the same count features regardless of i. The TF-IDF vectorizer is now the
# one that is fitted, and the labels say "tfidf".
# Results are gathered in a list and added to metric_df with one concat, because
# DataFrame.append was removed in pandas 2.0. Re-running this cell adds the same
# rows to metric_df again.
new_rows = []
for i in range(1, 4):
    # The vectorizer depends only on the n-gram size, so fit it once per i.
    # Vocabulary and IDF weights come from train and are reused for validate.
    tfidf = TfidfVectorizer(ngram_range=(i, i))
    # Names X_bow / X_val_bow are kept from the count section; they hold
    # TF-IDF weighted matrices here.
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)
    for j in range(3, 8):
        model = KNeighborsClassifier(n_neighbors=j)
        model.fit(X_bow, y_train)
        accuracy = model.score(X_bow, y_train)
        acc_validate = model.score(X_val_bow, y_validate)

        new_rows.append(
            {
                'model' : f'k_nearest-tfidf_{i}gram_{j}neighbors',
                'accuracy' : accuracy,
                'acc-validate' : acc_validate,
                # negative values mean the model scores worse on unseen data
                'difference' : acc_validate - accuracy,
            }
        )

metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)

In [51]:
# Logistic regression on TF-IDF features, one model per n-gram size (1-3).
# The cell used to create `tfidf` but then transformed the text with `cv`, a
# CountVectorizer left over from an earlier cell, so the n-gram size in the
# label did not match the features. The TF-IDF vectorizer is now the one that
# is fitted, and the labels say "tfidf".
# Results are gathered in a list and added to metric_df with one concat, because
# DataFrame.append was removed in pandas 2.0. Re-running this cell adds the same
# rows to metric_df again.
new_rows = []
for i in range(1, 4):
    # Vocabulary and IDF weights come from train and are reused for validate.
    tfidf = TfidfVectorizer(ngram_range=(i, i))
    # Names X_bow / X_val_bow are kept from the count section; they hold
    # TF-IDF weighted matrices here.
    X_bow = tfidf.fit_transform(X_train)
    X_val_bow = tfidf.transform(X_validate)
    model = LogisticRegression()
    model.fit(X_bow, y_train)
    accuracy = model.score(X_bow, y_train)
    acc_validate = model.score(X_val_bow, y_validate)

    new_rows.append(
        {
            'model' : f'logistic_regress-tfidf_{i}gram',
            'accuracy' : accuracy,
            'acc-validate' : acc_validate,
            # negative values mean the model scores worse on unseen data
            'difference' : acc_validate - accuracy,
        }
    )

metric_df = pd.concat([metric_df, pd.DataFrame(new_rows)], ignore_index=True)