In [2]:
#custome modules
import acquire as a
import wrangle as w

#visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# import standard libraries
import pandas as pd
import re

# import file managers
from os.path import isfile
import pickle

# import json handler
import json

# import language detector
from langdetect import detect

import nltk
import unicodedata
import re

#split
from sklearn.model_selection import train_test_split

#modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natasharivers/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/natasharivers/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<hr style="border:2px solid black"> </hr>

# Acquire/Prepare

In [3]:
df= w.wrangle_github_repos(new_pickles=False, get_new_links=False,
                                             number_of_pages=25)

In [4]:
df.head()

AttributeError: 'tuple' object has no attribute 'head'

In [None]:
#take a look at the df
df.info()

In [None]:
df.shape

In [None]:
def clean(text):
    '''
    This function is a simplified text cleaning function
    '''
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9\s]", '', text)

In [None]:
#see the count for each language
df.programming_language.value_counts()

#python appears to be the most used language

In [None]:
#take a look at all python 
df[df.programming_language == 'Python']

In [None]:
#python clean only
python_words = (' '.join(df[df.programming_language == 'Python'].cleaned_readme))
python_words

In [None]:
#clean java only
java_words = (' '.join(df[df.programming_language == 'JavaScript'].cleaned_readme))

#clean Jupyter Notebook only
jupyter_words = (' '.join(df[df.programming_language == 'Jupyter Notebook'].cleaned_readme))

#html clean only
html_words = (' '.join(df[df.programming_language == 'HTML'].cleaned_readme))

#typescript clean only
type_words = (' '.join(df[df.programming_language == 'TypeScript'].cleaned_readme))

#r clean only
r_words = (' '.join(df[df.programming_language == 'R'].cleaned_readme))

#all clean text
all_words = (' '.join(df.cleaned_readme))

<hr style="border:2px solid black"> </hr>

# Explore

In [None]:
all_words.split()

In [None]:
#turn all words into a series
#get value count of all words
pd.Series(all_words.split()).value_counts()

#removing stopwords would change this count

In [None]:
#create a series with words split and value counts for each word
python_freq = pd.Series(python_words.split()).value_counts()
java_freq = pd.Series(java_words.split()).value_counts()
jupyter_freq = pd.Series(jupyter_words.split()).value_counts()
html_freq = pd.Series(html_words.split()).value_counts()
type_freq = pd.Series(type_words.split()).value_counts()
r_freq = pd.Series(r_words.split()).value_counts()
all_freq = pd.Series(all_words.split()).value_counts()

In [None]:
python_freq

In [None]:
#combine all frequency into single Series
word_counts = pd.concat([python_freq, java_freq, jupyter_freq, html_freq, type_freq, r_freq, all_freq], axis=1).fillna(0).astype(int)

#word count for each language type
word_counts.columns = ['python', 'java', 'jupyter', 'html', 'typescript', 'r', 'all']

In [None]:
#sorted by most common to most common in 'all' 
word_counts.sort_values('all', ascending=False).head(20)

<hr style="border:1px solid black"> </hr>

# Visualize

In [None]:
#dictate size of figure
plt.rc('figure', figsize=(14, 8))
plt.style.use('seaborn-darkgrid')

In [None]:
#using 'all' languages
#visualize top 10
word_counts.sort_values('all', ascending=False).head(10)[['python', 'java', 'jupyter', 'html', 'typescript', 'r']].plot.barh()

plt.title('Top 10 words by Language')
plt.xlabel('Frequency')
plt.ylabel('Word')

In [None]:
#visual proportions
#axis=1 in .apply means row by row
word_counts.sort_values(by='all', ascending=False).head(10).apply(lambda row: row/ row['all'],axis=1)

In [None]:
#visualize all languages as percent of 'all'
(word_counts.sort_values(by='all', ascending=False)
 .head(10)
 .apply(lambda row: row/ row['all'],axis=1)
 .drop(columns='all')
 .plot.barh(stacked=True))
plt.title('% of 10  most common words by language')
None

<hr style="border:1px solid black"> </hr>

# n-grams

### Python

In [None]:
#see most common bigrams in python languages
pd.Series(nltk.bigrams(python_words.split())).value_counts()

### JavaScript

In [None]:
#see most common bigrams in JavaScript language
pd.Series(nltk.bigrams(java_words.split())).value_counts()

### Jupyter Notebook

In [None]:
#see most common bigrams in JupyterNotebook language
pd.Series(nltk.bigrams(jupyter_words.split())).value_counts()

### HTML

In [None]:
#see most common bigrams in HTML language
pd.Series(nltk.bigrams(html_words.split())).value_counts()

### TypeScript

In [None]:
#see most common bigrams in TypeScript language
pd.Series(nltk.bigrams(type_words.split())).value_counts()

### R

In [None]:
#see most common bigrams in R language
pd.Series(nltk.bigrams(r_words.split())).value_counts()

In [None]:
#visualize top ten for all languages
pd.Series(nltk.bigrams(all_words.split())).value_counts().head(10).plot.barh()
plt.title('Top 10 most common all languages bigrams')
plt.ylabel('Bigram')
plt.xlabel('Frequency')
None

<hr style="border:1px solid black"> </hr>

# Word Clouds

### Python

In [None]:
img = WordCloud(background_color='white', width=800, height=600).generate(python_words)
plt.imshow(img)
plt.axis('off')

### JavaScript

In [None]:
img = WordCloud(background_color='white', width=800, height=600).generate(java_words)
plt.imshow(img)
plt.axis('off')

### Jupyter Notebook

In [None]:
img = WordCloud(background_color='white', width=800, height=600).generate(jupyter_words)
plt.imshow(img)
plt.axis('off')

### HTML

In [None]:
img = WordCloud(background_color='white', width=800, height=600).generate(html_words)
plt.imshow(img)
plt.axis('off')

### TypeScript

In [None]:
img = WordCloud(background_color='white', width=800, height=600).generate(type_words)
plt.imshow(img)
plt.axis('off')

### R

In [None]:
img = WordCloud(background_color='white', width=800, height=600).generate(r_words)
plt.imshow(img)
plt.axis('off')

<hr style="border:2px solid black"> </hr>

# Model

In [None]:
df.head()

In [None]:
#term frequency
cv = CountVectorizer()
#identify X
X = cv.fit_transform(df.cleaned_readme)
#identify target (language)
y = df.programming_language

In [None]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

In [None]:
#find value counts to find baseline language
y_train.value_counts()

In [None]:
df.head()

In [None]:
baseline_accuracy = (y_train == 'Python').mean()
print(f'Baseline accuracy is: {baseline_accuracy:.2%}')

## Decision Tree Model

In [None]:
#create model - decision tree classifier
tree = DecisionTreeClassifier(max_depth=5, random_state=123)

#fit
tree.fit(X_train, y_train)

#get scores 
print(f'training score for Decision Tree(max_depth =5): {tree.score(X_train, y_train):.2%}')
print(f'test score for Decision Tree(max_depth =5): {tree.score(X_test, y_test):.2%}')

In [None]:
#create model - decision tree classifier
tree2 = DecisionTreeClassifier(max_depth=3, random_state=123)

#fit the model
tree2.fit(X_train, y_train)

#get scores 
print(f'training score for Decision Tree(max_depth =3): {tree2.score(X_train, y_train):.2%}')
print(f'test score for Decision Tree(max_depth =3): {tree2.score(X_test, y_test):.2%}')

In [None]:
#create model - decision tree classifier
tree3 = DecisionTreeClassifier(max_depth=10, random_state=123)

#fit the model
tree3.fit(X_train, y_train)

#get scores 
print(f'training score for Decision Tree(max_depth =10): {tree3.score(X_train, y_train):.2%}')
print(f'test score for Decision Tree(max_depth =10): {tree3.score(X_test, y_test):.2%}')

## Logistic Regression

In [None]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.cleaned_readme)
y = df.programming_language

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))
lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

In [None]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Try on Lemmatized readme instead of cleaned readme

In [None]:
#term frequency
cv_lem = CountVectorizer()
#identify X
X_lem = cv_lem.fit_transform(df.lemmatized_readme)
#identify target (language)
y_lem = df.programming_language