In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import prepare as prp

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickolaspedrimiranda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickolaspedrimiranda/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the scraped repositories, then run the project's cleansing helper on the
# README text (the preview shows clean / stemmed / lemmatized columns afterwards)
raw_repos = pd.read_csv('all_repos.csv')
repo = prp.cleanse(raw_repos, 'readme_contents')
repo.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
0,ScottfreeLLC/AlphaPy,Python,AlphaPy\n=======\n\n|badge_pypi| |badge_downlo...,alphapy badgepypi badgedownloads badgedocs bad...,alphapi badgepypi badgedownload badgedoc badge...,alphapy badgepypi badgedownloads badgedocs bad...
1,jalapic/engsoccerdata,R,"Latest GitHub version: 11/4/2022, v0.1.7\n\nN...",latest github version 1142022 v017 nov 2022 up...,latest github version 1142022 v017 nov 2022 up...,latest github version 1142022 v017 nov 2022 up...
2,bttmly/nba,JavaScript,# nba\n*Node.js client for nba.com API endpoin...,nba nodejs client nbacom api endpoints npm ins...,nba nodej client nbacom api endpoint npm insta...,nba nodejs client nbacom api endpoint npm inst...
3,kyleskom/NBA-Machine-Learning-Sports-Betting,Python,# NBA Sports Betting Using Machine Learning 🏀\...,nba sports betting using machine learning img ...,nba sport bet use machin learn img srchttpsgit...,nba sport betting using machine learning img s...
4,linouk23/NBA-Player-Movements,Python,# NBA Player Movements\n\nThis is a script for...,nba player movements script visualization nba ...,nba player movement script visual nba game raw...,nba player movement script visualization nba g...


In [3]:
def replace_values_not_in_list(series, mylist, false_value):
    """Return a list mirroring ``series`` with unlisted values substituted.

    Args:
        series: Iterable of values to scan, in order.
        mylist: Container of values that are kept unchanged.
        false_value: Replacement used for every value not found in ``mylist``.

    Returns:
        A new list, the same length and order as ``series``.
    """
    return [item if item in mylist else false_value for item in series]

In [4]:
def top_languages(n):
    """Relabel ``repo.language`` so only the ``n`` most frequent languages remain.

    Reads the module-level ``repo`` DataFrame. Every language outside the top
    ``n`` by frequency is replaced with the placeholder ``'Other'``.

    The placeholder itself is excluded from the frequency ranking. Without
    that, calling this again on already-relabeled data would count 'Other'
    as one of the top ``n`` and silently fold the n-th real language into it.

    Args:
        n: Number of distinct languages to keep.

    Returns:
        A list of labels, one per row of ``repo``, in row order.
    """
    placeholder = 'Other'
    language_counts = repo.language.value_counts()
    # errors='ignore' makes this a no-op on raw data that has no placeholder yet
    language_counts = language_counts.drop(placeholder, errors='ignore')
    kept_languages = list(language_counts.index[:n])
    return replace_values_not_in_list(repo.language, kept_languages, placeholder)

In [5]:
# Keep the four most frequent languages and label everything else 'Other'
repo.language = top_languages(4)

In [6]:
# Class sizes after relabeling
repo.language.value_counts()

Other         341
Python        232
JavaScript    141
R              43
TypeScript     40
Name: language, dtype: int64

In [7]:
# Split into train / validation / test with the project helper, passing
# 'language' and stratify=True; the helper prints the three resulting shapes
train, val, test = prp.train_val_test(repo, 'language', stratify=True)

(557, 6) (120, 6) (120, 6)


In [8]:
# Separate the features from the 'language' target for each of the three splits
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    prp.split_xy(part, 'language') for part in (train, val, test)
)

In [9]:
# Fit the TF-IDF vectorizer on the training text only, then apply the same
# fitted vectorizer to the validation and test text
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train.clean)
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(part.clean) for part in (X_val, X_test)
)

In [10]:
# class_weight='balanced' weights classes inversely to their frequency,
# which matters here because the language counts are uneven
lm = LogisticRegression(class_weight='balanced')

lm.fit(X_train_tfidf, y_train)

In [11]:
# Pair the true labels with the model's predictions for train and validation
train_preds = lm.predict(X_train_tfidf)
val_preds = lm.predict(X_val_tfidf)

y_train_res = pd.DataFrame({'actual': y_train, 'preds': train_preds})
y_val_res = pd.DataFrame({'actual': y_val, 'preds': val_preds})

In [12]:
# First rows of actual vs. predicted labels on the training split
y_train_res.head()

Unnamed: 0,actual,preds
536,Other,Other
193,R,R
148,Python,Python
150,Python,Python
239,JavaScript,JavaScript


In [13]:
# First rows of actual vs. predicted labels on the validation split
y_val_res.head()

Unnamed: 0,actual,preds
614,Other,Other
204,Other,Other
67,Python,Python
341,Other,Other
256,Python,Other


In [14]:
# Start the baseline table from the training labels
baseline = pd.DataFrame(y_train)

In [15]:
# Predict the most frequent training class for every row. Deriving it from the
# labels (instead of hard-coding 'Other') keeps the baseline correct if the
# number of kept languages or the data changes.
baseline['baseline'] = baseline.language.mode()[0]

In [16]:
# Display the baseline table: actual language next to the constant guess
baseline

Unnamed: 0,language,baseline
536,Other,Other
193,R,Other
148,Python,Other
150,Python,Other
239,JavaScript,Other
...,...,...
763,Other,Other
753,Other,Other
415,Python,Other
230,Python,Other


In [17]:
# Class counts among the training labels
baseline.language.value_counts()

Other         238
Python        162
JavaScript     99
R              30
TypeScript     28
Name: language, dtype: int64

In [18]:
# Share of training rows whose true language equals the constant baseline guess
n_correct = (baseline.language == baseline.baseline).sum()
baseline_accuracy = n_correct / len(baseline)
print(f'baseline {round(baseline_accuracy, 2)}')

baseline 0.43


In [28]:
# End-to-end rerun of the first attempt in one cell:
# load -> relabel -> split -> TF-IDF -> logistic regression -> accuracy
repo = pd.read_csv('all_repos.csv')
repo = prp.cleanse(repo, 'readme_contents')

# Use a distinct name for the list of kept languages. Binding it to
# `top_languages` would overwrite the function of that name defined earlier
# and make any later call to top_languages(...) fail with a TypeError.
top_language_names = list(repo.language.value_counts().index[:4])
filtered_languages = replace_values_not_in_list(repo.language, top_language_names, 'Other')
repo.language = filtered_languages

train, val, test = prp.train_val_test(repo, 'language', stratify=True, print_shape=False)

X_train, y_train = prp.split_xy(train, 'language')
X_val, y_val = prp.split_xy(val, 'language')
X_test, y_test = prp.split_xy(test, 'language')

# Fit the vectorizer on training text only; validation/test are transformed with it
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train.clean)
X_val_tfidf = tfidf.transform(X_val.clean)
X_test_tfidf = tfidf.transform(X_test.clean)

lm = LogisticRegression(class_weight='balanced')

lm.fit(X_train_tfidf, y_train)

y_train_res = pd.DataFrame({'actual': y_train,
                            'preds': lm.predict(X_train_tfidf)})

y_val_res = pd.DataFrame({'actual': y_val,
                          'preds': lm.predict(X_val_tfidf)})

# Accuracy = fraction of rows where the prediction matches the actual label
print(f'train {round((y_train_res.actual == y_train_res.preds).sum()/(len(y_train_res)),2)}')
print(f'val   {round((y_val_res.actual == y_val_res.preds).sum()/(len(y_val_res)),2)}')
print()

train 0.97
val   0.66



# Second try: filter out low-count words and vectorize the lemmatized text

In [62]:
# Reload and cleanse, this time relabeling with the prp module's top_languages helper
repo = pd.read_csv('all_repos.csv')
repo = prp.cleanse(repo, 'readme_contents')
repo.language = prp.top_languages(repo, 4)

# Word counts from the project helper; a later cell thresholds on its 'all' column
all_words = prp.word_counts(repo, 'language', 'clean')

In [66]:
# Read and relabel the raw data once: neither step depends on the threshold n,
# so repeating them on every iteration only added redundant file I/O.
repo_raw = pd.read_csv('all_repos.csv')
repo_raw.language = prp.top_languages(repo_raw, 4)

for n in range(0, 101, 5):
    # Words whose value in the 'all' column is below n are handed to
    # prp.cleanse as extra_words (presumably removed from the text -- confirm
    # in prepare.py)
    word_filter = list(all_words[all_words['all'] < n].index)

    # Cleanse a copy so every threshold starts from the same unmodified frame
    repo = prp.cleanse(repo_raw.copy(), 'readme_contents', extra_words=word_filter)

    train, val, test = prp.train_val_test(repo, 'language', stratify=True, print_shape=False)

    X_train, y_train = prp.split_xy(train, 'language')
    X_val, y_val = prp.split_xy(val, 'language')
    X_test, y_test = prp.split_xy(test, 'language')

    # Fit the vectorizer on the training text only, using the lemmatized column
    tfidf = TfidfVectorizer()

    X_train_tfidf = tfidf.fit_transform(X_train.lemmatized)
    X_val_tfidf = tfidf.transform(X_val.lemmatized)
    X_test_tfidf = tfidf.transform(X_test.lemmatized)

    lm = LogisticRegression(class_weight='balanced')

    lm.fit(X_train_tfidf, y_train)

    y_train_res = pd.DataFrame({'actual': y_train,
                                'preds': lm.predict(X_train_tfidf)})

    y_val_res = pd.DataFrame({'actual': y_val,
                              'preds': lm.predict(X_val_tfidf)})

    # Accuracy per split, tagged with the threshold that produced it
    print(f'train {round((y_train_res.actual == y_train_res.preds).sum()/(len(y_train_res)),2)}         {n} words')
    print(f'val   {round((y_val_res.actual == y_val_res.preds).sum()/(len(y_val_res)),2)}         {n} words')
    print()

train 0.96         0 words
val   0.66         0 words

train 0.95         5 words
val   0.64         5 words

train 0.93         10 words
val   0.66         10 words

train 0.91         15 words
val   0.64         15 words

train 0.9         20 words
val   0.63         20 words

train 0.89         25 words
val   0.65         25 words

train 0.88         30 words
val   0.66         30 words

train 0.88         35 words
val   0.67         35 words

train 0.87         40 words
val   0.65         40 words

train 0.86         45 words
val   0.64         45 words

train 0.86         50 words
val   0.62         50 words

train 0.85         55 words
val   0.62         55 words

train 0.85         60 words
val   0.61         60 words

train 0.85         65 words
val   0.6         65 words

train 0.85         70 words
val   0.63         70 words

train 0.83         75 words
val   0.61         75 words

train 0.82         80 words
val   0.58         80 words

train 0.82         85 words
val   0.5