In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from importlib import reload

import acquire as a 
import prepare as p
import explore as e

from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import model as m

In [2]:
df = a.acquire_readmes()

In [3]:
df = p.prep_df_for_nlp(df,'readme_contents',extra_words=p.EXTRA_WORDS)

In [4]:
train, validate, test = p.split_data(df, 'language')

In [5]:
train

Unnamed: 0,repo,language,readme_contents,clean,stem,lemmatized,username,lemmatized_len
482,awesome-actions,Not Listed,"<p align=""center"">\n <br>\n <img src=""awes...",awesome actions awesome github actions status ...,awesom action awesom github action statu sdra ...,awesome action awesome github action status sd...,sdras,17532
646,InstaPy,Python,"<p align=""center"">\n <img src=""https://i.imgu...",instapy tooling automates social media interac...,instapi tool autom social media interact farm ...,instapy tooling automates social medium intera...,InstaPy,872
474,osquery,Other,"# osquery\n\n<p align=""center"">\n<img alt=""osq...",osquery altosquery logo width200 src osquery s...,osqueri altosqueri logo width200 src osqueri s...,osquery altosquery logo width200 src osquery s...,osquery,3064
545,Best-App,Not Listed,\nBest App\n----\n\n*经常会有朋友想知道有哪些 Apps 或 服务 是值...,best app apps bestapp ios app app starpull req...,best app app bestapp io app app starpul reques...,best app apps bestapp io app app starpull requ...,hzlzh,3646
539,actix-web,Other,actix-web/README.md,actixweb readmemd,actixweb readmemd,actixweb readmemd,actix,17
...,...,...,...,...,...,...,...,...
47,sly,JavaScript,# [Sly](http://darsa.in/sly)\n\nJavaScript lib...,sly javascript library onedirectional scrollin...,sli javascript librari onedirect scroll item b...,sly javascript library onedirectional scrollin...,darsain,1238
268,free,Not Listed,更新时间 2023-01-12 00:00 \n所有免费节点都爬取自网络，请勿用于非法用途...,20230112 0000 android windows v2ray v2rayng v2...,20230112 0000 android window v2ray v2rayng v2r...,20230112 0000 android window v2ray v2rayng v2r...,freefq,13112
230,hello-algorithm,Java,## 简介\n\nEnglish version repo and Gitbook is o...,english repo gitbook english branch part1 part...,english repo gitbook english branch part1 part...,english repo gitbook english branch part1 part...,geekxh,659
304,jq,Other,README.md,readmemd,readmemd,readmemd,stedolan,8


In [9]:
def encode_has_language(df):
    df['has_java'] = df.lemmatized.str.contains('java')
    df['has_javascript'] = df.lemmatized.str.contains('javascript')
    df['has_python'] = df.lemmatized.str.contains('python')
    df['has_typescript'] = df.lemmatized.str.contains('typescript')
    df['has_awesome'] = df.repo.str.contains('awesome')
    df['has_react'] = df.repo.str.contains('react')
    df['has_go'] = df.repo.str.contains('go')
    
    return df

In [10]:
def encode_for_model(train, validate, test):
    train = encode_has_language(train)
    validate = encode_has_language(validate)
    test = encode_has_language(test)
    return train, validate, test

In [11]:
train, validate, test = encode_for_model(train, validate, test)

In [12]:
train

Unnamed: 0,repo,language,readme_contents,clean,stem,lemmatized,username,lemmatized_len,has_java,has_javascript,has_python,has_typescript,has_awesome
482,awesome-actions,Not Listed,"<p align=""center"">\n <br>\n <img src=""awes...",awesome actions awesome github actions status ...,awesom action awesom github action statu sdra ...,awesome action awesome github action status sd...,sdras,17532,True,True,True,True,True
646,InstaPy,Python,"<p align=""center"">\n <img src=""https://i.imgu...",instapy tooling automates social media interac...,instapi tool autom social media interact farm ...,instapy tooling automates social medium intera...,InstaPy,872,False,False,True,False,False
474,osquery,Other,"# osquery\n\n<p align=""center"">\n<img alt=""osq...",osquery altosquery logo width200 src osquery s...,osqueri altosqueri logo width200 src osqueri s...,osquery altosquery logo width200 src osquery s...,osquery,3064,False,False,False,False,False
545,Best-App,Not Listed,\nBest App\n----\n\n*经常会有朋友想知道有哪些 Apps 或 服务 是值...,best app apps bestapp ios app app starpull req...,best app app bestapp io app app starpul reques...,best app apps bestapp io app app starpull requ...,hzlzh,3646,False,False,False,False,False
539,actix-web,Other,actix-web/README.md,actixweb readmemd,actixweb readmemd,actixweb readmemd,actix,17,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,sly,JavaScript,# [Sly](http://darsa.in/sly)\n\nJavaScript lib...,sly javascript library onedirectional scrollin...,sli javascript librari onedirect scroll item b...,sly javascript library onedirectional scrollin...,darsain,1238,True,True,False,False,False
268,free,Not Listed,更新时间 2023-01-12 00:00 \n所有免费节点都爬取自网络，请勿用于非法用途...,20230112 0000 android windows v2ray v2rayng v2...,20230112 0000 android window v2ray v2rayng v2r...,20230112 0000 android window v2ray v2rayng v2r...,freefq,13112,False,False,False,False,False
230,hello-algorithm,Java,## 简介\n\nEnglish version repo and Gitbook is o...,english repo gitbook english branch part1 part...,english repo gitbook english branch part1 part...,english repo gitbook english branch part1 part...,geekxh,659,True,False,False,False,False
304,jq,Other,README.md,readmemd,readmemd,readmemd,stedolan,8,False,False,False,False,False


In [14]:
tfidf = TfidfVectorizer()

In [16]:
X_tfidf = tfidf.fit_transform(train.lemmatized)
tf_idf_train = pd.DataFrame(X_tfid.todense(), 
             columns=tfidf.get_feature_names_out())

In [17]:
idf_values = pd.Series(
    dict(
        zip(
            tfidf.get_feature_names_out(), tfidf.idf_)))

In [54]:
def get_idf(df):
    tfidf = TfidfVectorizer()
    bag_of_words = tfidf.fit_transform(df.lemmatized)
    pd.DataFrame(bag_of_words.todense(), 
                 columns=tfidf.get_feature_names_out())
    idf_values = pd.Series(
    dict(
        zip(
            tfidf.get_feature_names_out(), tfidf.idf_)))
    return idf_values.describe()

In [55]:
get_idf(train)

count    46023.000000
mean         6.140426
std          0.651265
min          1.359252
25%          6.402677
50%          6.402677
75%          6.402677
max          6.402677
dtype: float64

In [44]:
idf_values = pd.Series(
    dict(
        zip(
            tfidf.get_feature_names_out(), tfidf.idf_)))

In [47]:
idf_values.describe()

In [49]:
import pprint

In [51]:
pprint.pprint(idf_describe)

count    46023.000000
mean         6.140426
std          0.651265
min          1.359252
25%          6.402677
50%          6.402677
75%          6.402677
max          6.402677
dtype: float64


In [20]:
y_train = train.language

In [26]:
tree = DecisionTreeClassifier(max_depth=3, random_state=27)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

0.5733634311512416

In [24]:
tfidf_bigrams = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_bigrams = tfidf_bigrams.fit_transform(train.lemmatized)
pd.DataFrame(X_tfidf_bigrams.todense(), 
             columns=tfidf_bigrams.get_feature_names_out())

Unnamed: 0,00,00 00,00 000000b0,00 01,00 03,00 05,00 0505,00 10,00 11,00 20,...,zypper fd,zypper flameshot,zypper hub,zypper si,zypper sqlitebrowser,zypper sudo,zzh1996,zzh1996 volltin,zzrotdesign,zzrotdesign dockergc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
tree2 = DecisionTreeClassifier(max_depth=7, random_state=27)
tree2.fit(X_tfidf_bigrams, y_train)
tree2.score(X_tfidf_bigrams, y_train)

0.7584650112866818

In [35]:
reload(e)

<module 'explore' from '/Users/fostermark/codeup-data-science/nlp-project/explore.py'>

In [36]:
e.tfidf_df(train)



In [None]:
def tfidf_df(df):
    tfidf = TfidfVectorizer()
    bag_of_words = tfidf.fit_transform(df.lemmatized)
    pd.DataFrame(bag_of_words.todense(), 
                 columns=tfidf.get_feature_names())