In [45]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
from env import github_token, github_username
import unicodedata
import nltk
import prepare as p
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import model as m
from sklearn.tree import DecisionTreeClassifier

# CodeUp-DS-NLP-Project
 
### Project Goals 
* The goal of this classification project is to identify the key words associated with each programming language and to build a machine learning model that can effectively predict the programming language used.
### The Plan
* Acquire README data from GitHub repositories via web scraping.
* Prepare data for exploration by:
    * Convert text to all lower case for normalcy.
    * Remove any accented characters, non-ASCII characters.
    * Remove special characters.
    * Lemmatize the words.
    * Remove stopwords.
    * Store the clean text and the original text for use in future notebooks.
#### Explore data in search of key features with the basic following questions:
* What are the most common words in READMEs?
* Does the length of the README vary by programming language?
* Do different programming languages use a different number of unique words?
* Are there any words that uniquely identify a programming language?
#### Develop a model to predict the programming language
* Use key words identified to build predictive models of different types
* Evaluate models on train and validate data samples
* Select the best model based on accuracy
* Evaluate the best model on test data samples
#### Draw conclusions

### Steps to Reproduce
* Clone this repo.
* Acquire the data from GitHub
* Put the data in the file containing the cloned repo.
* Run notebook
### Conclusions
* Decision Tree model Accuracy scores:
    
        * 0.704762 on training data samples
        * 0.637363 on validate data samples
        * 0.671052 on test data samples
        
#### Key TakeAway:
    The Decision Tree model beat the 45% baseline accuracy on the train, validate and test data sets.
### Recommendations

   * Consider acquiring larger "text" datasets
   * Consider hyperparameter tuning
   * Consider gradient boosting algorithms

In [None]:

#url = "https://github.com/search?3&q=stars%3A%3E0&s=stars&type=Repositories"
#reqs = requests.get(url)
#soup = BeautifulSoup(reqs.text, 'html.parser')
#urls = []
#for link in soup.find_all('a',class_="v-align-middle"):
#    print(link.get('href'))

In [None]:
# remember the lesson that Rosy showed you
#for i in range(1,50):
#    print(i)
#    time.sleep(10)

In [None]:

#url = "https://github.com/search?3&q=stars%3A%3E0&s=stars&type=Repositories"
#reqs = requests.get(url)
#soup = BeautifulSoup(reqs.text, 'html.parser')
#urls = []
#for link in soup.find_all('a',class_="v-align-middle"):
#    urls.append(link.get('href'))

In [None]:
#urls_repo = pd.read_csv('urls.csv', index_col=0)
#urls_repo['0']

In [None]:
# Repository names to scrape: the '0' column of urls.csv.
# Reading the file here keeps the cell runnable on a fresh kernel instead of
# relying on a `urls_repo` variable that no active cell defines.
REPOS = pd.read_csv('urls.csv', index_col=0)['0']

# Every GitHub API request is sent with the token and username imported from env.
headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

# Fail fast when the imported credentials are empty strings.
if headers["Authorization"] == "token " or headers["User-Agent"] == "":
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    """
    Send a GET request to the GitHub API and return the decoded JSON body.

    url: full API url to request; the module-level `headers` are sent with it.
    Returns the parsed JSON (a list or a dict, depending on the endpoint).
    Raises Exception when the status code is not 200. The message carries the
    status code and the response body.
    """
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # An error body is not guaranteed to be JSON; fall back to the raw text
        # so a decode failure never hides the HTTP status.
        try:
            details = json.dumps(response.json())
        except ValueError:
            details = response.text
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {details}"
        )
    return response.json()


def get_repo_language(repo: str) -> Optional[str]:
    """
    Look up the language GitHub reports for a repository.

    repo: repository name, interpolated into the api.github.com/repos/ url.
    Returns the value stored under the "language" key of the API response
    (presumably None when GitHub reports no language -- confirm against the
    API documentation).
    Raises Exception when the response is not a dictionary or has no
    "language" key.
    """
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if not isinstance(repo_info, dict):
        raise Exception(
            f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
        )
    if "language" not in repo_info:
        raise Exception(
            "'language' key not found in response\n{}".format(json.dumps(repo_info))
        )
    return repo_info["language"]


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    """
    Request the contents listing of a repository from the GitHub API.

    repo: repository name, interpolated into the .../contents/ url.
    Returns the list the API responds with.
    Raises Exception when the response is anything other than a list.
    """
    url = f"https://api.github.com/repos/{repo}/contents/"
    listing = github_api_request(url)
    if type(listing) is not list:
        raise Exception(
            f"Expecting a list response from {url}, instead got {json.dumps(listing)}"
        )
    return cast(List, listing)


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.

    files: entries of a repo listing; each must have a "name" key.
    Returns the "download_url" of the first entry whose name starts with
    "readme" (case-insensitive) and that has a non-empty download url, or ""
    when there is no such entry.
    """
    for file in files:
        if not file["name"].lower().startswith("readme"):
            continue
        # An entry can match by name yet carry no download url (the GitHub API
        # reports null for directories); skip it and keep looking so the
        # function always returns a string.
        download_url = file.get("download_url")
        if download_url:
            return download_url
    return ""


def process_repo(repo: str) -> Dict[str, str]:
    """
    Collect the language and README text for one repo.

    repo: a repo name like "gocodeup/codeup-setup-script".
    Returns a dictionary with the keys "repo", "language" and
    "readme_contents"; the README text is "" when no README download url
    was found in the repo listing.
    """
    readme_url = get_readme_download_url(get_repo_contents(repo))
    # Only hit the network for the README when a download url was found.
    readme_text = requests.get(readme_url).text if readme_url != "" else ""
    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_text,
    }


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Process every repo named in the module-level REPOS, in order.

    Returns a list with one process_repo() result per repo.
    """
    scraped = []
    for repo_name in REPOS:
        scraped.append(process_repo(repo_name))
    return scraped


#if __name__ == "__main__":
#    data = scrape_github_data()
#    json.dump(data, open("data.json", "w"), indent=1)

In [None]:
#urls_df = scrape_github_data()

In [None]:
#df=pd.DataFrame(urls_df)
df = pd.read_csv('readme_df.csv')
df

In [None]:
df.dropna(inplace=True)

In [None]:
df.shape

In [None]:
df = df[(df.language == 'Java') | (df.language=='JavaScript') | (df.language=='Python') | (df.language=='TypeScript')]

In [None]:
df.reset_index(drop = True, inplace= True)

In [None]:
def basic_clean(string):
    '''
    Normalize a string for text analysis.

    The text is NFKD-normalized, stripped of anything that cannot be encoded
    as ASCII, cleared of every character that is not a word character, a
    digit, an apostrophe or whitespace, and finally lowercased.
    Returns the cleaned string.
    '''
    # decompose accented characters so their base letters survive the ASCII step
    decomposed = unicodedata.normalize('NFKD', string)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    # drop the undesirable characters, then lowercase what is left
    stripped = re.sub(r"[^\w0-9'\s]", '', ascii_text)
    return stripped.lower()

In [None]:
#inshort_df[‘clean_text’] = inshort_df.content.apply(clean).apply(' ’.join)

In [None]:
df['clean_text']= df.readme_contents.apply(p.basic_clean)

# Start here after scraping
### get dataframe from csv and clean using prep functions

In [11]:
# acquire readme data
df = pd.read_csv('readme_df.csv')

In [12]:
# Clean the README dataframe with data_prep from the project's prepare module
# (implementation not shown here -- presumably it drops nulls and keeps the
# languages of interest; TODO confirm).
df = p.data_prep(df)

# Prepare the README text for exploration with text_prep from the same module
# (presumably it produces the clean_text column the later cells read; TODO confirm).
df = p.text_prep(df)


# Split into train, validate and test, passing 'language' as the target column.
train, validate, test = p.split_data(df, 'language')

### RATIOS

In [None]:
labels = pd.concat([df.language.value_counts(),
                    df.language.value_counts(normalize=True)], axis=1)
labels.columns = ['n', 'percent']
labels


In [None]:
def show_counts_and_ratios(df, column):
    """
    Summarize how often each value of one column occurs.

    df: dataframe to summarize.
    column: name of the column whose values are counted.
    Returns a dataframe indexed by value with the absolute counts in 'n'
    and the normalized counts in 'percent'.
    """
    counts = df[column].value_counts()
    ratios = df[column].value_counts(normalize=True)
    summary = pd.concat([counts, ratios], axis=1)
    summary.columns = ['n', 'percent']
    return summary




In [None]:
def tokenize(string):
    '''
    Tokenize a string with nltk's ToktokTokenizer.

    Returns the tokenized text as a single string rather than a list,
    because the tokenizer is called with return_str = True.
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str = True)

In [None]:
df['clean_text']= df.clean_text.apply(p.tokenize)

In [None]:
df

In [None]:
def lemmatize(string):
    '''
    Lemmatize every whitespace-separated word of a string.

    Each word is passed through nltk's WordNetLemmatizer and the results
    are joined back together with single spaces.
    Returns the lemmatized string.
    '''
    wnl = nltk.stem.WordNetLemmatizer()
    # string.split() yields every token of the document
    return ' '.join(map(wnl.lemmatize, string.split()))

In [None]:
df['clean_text']= df.clean_text.apply(p.lemmatize)

In [None]:
extra_words = ["'"]

In [None]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    Remove stopwords from a string.

    string: document whose whitespace-separated words are filtered.
    extra_words: additional words to treat as stopwords.
    exclude_words: nltk English stopwords that should be kept.
    Returns the remaining words joined with single spaces.
    '''
    # nltk's English stopwords, minus the exclusions, plus the extras
    blocked = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)
    kept = []
    for token in string.split():
        if token not in blocked:
            kept.append(token)
    return ' '.join(kept)

In [None]:
df['clean_text']= df.clean_text.apply(p.remove_stopwords)

In [None]:
df

In [None]:
def split_data(df, target, test_size=.2, validate_size=.3, random_state=123):
    """
    Split a dataframe into train, validate and test, stratified on the target.

    df: dataframe to split.
    target: name of the column to stratify on.
    test_size: share of df held out as test (default 20%).
    validate_size: share of the remaining rows held out as validate
        (default 30% of the remaining 80%).
    random_state: seed passed to both splits so the result is reproducible.
    With the defaults the split is approximately train 56%, validate 24%,
    test 20%.
    Returns train, validate, and test.
    """
    # split test data from train/validate
    train_validate, test = train_test_split(df, test_size=test_size,
                                            random_state=random_state,
                                            stratify=df[target])
    # split train from validate
    train, validate = train_test_split(train_validate, test_size=validate_size,
                                       random_state=random_state,
                                       stratify=train_validate[target])
    return train, validate, test

In [None]:
train, validate, test = split_data(df, 'language')

In [None]:
train.language.nunique()

In [None]:
train

In [None]:
# setting basic style parameters for matplotlib
plt.rc('figure', figsize=(13, 7))
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later dropped the old names; try the new name and fall back on older versions.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')

In [None]:
def _words_of(readmes):
    # Join the cleaned READMEs with single spaces, then split on single spaces.
    return ' '.join(readmes).split(' ')

# One flat word list per language, plus one for the whole training set.
JavaScript_words = _words_of(train[train.language == 'JavaScript'].clean_text)
Java_words = _words_of(train[train.language == 'Java'].clean_text)
Python_words = _words_of(train[train.language == 'Python'].clean_text)
TypeScript_words = _words_of(train[train.language == 'TypeScript'].clean_text)
All_words = _words_of(train.clean_text)

In [None]:
All_words

In [None]:
JavaScript_freq = pd.Series(JavaScript_words).value_counts()
Java_freq = pd.Series(Java_words).value_counts()
Python_freq = pd.Series(Python_words).value_counts()
TypeScript_freq = pd.Series(TypeScript_words).value_counts()
All_words_freq = pd.Series(All_words).value_counts()

In [None]:
JavaScript_freq.head(10)

In [None]:
Java_freq.head(15)

In [None]:
Python_freq.head(10)

In [None]:
TypeScript_freq.head(10)

In [None]:
# One column of counts per language plus the overall counts, indexed by word.
word_counts = pd.concat([JavaScript_freq, Java_freq, Python_freq, TypeScript_freq, All_words_freq], axis=1, sort=True)
# Assign the labels directly instead of set_axis(..., inplace=False): the
# `inplace` argument was removed from set_axis in pandas 2.0.
word_counts.columns = ['JavaScript', 'Java', 'Python', 'TypeScript', 'AllWords']
# A word missing from one language comes out of the concat as NaN; count it as 0.
word_counts = word_counts.fillna(0).astype(int)


In [None]:
word_counts.head(50)

In [None]:
# TODO: t-test of document length (number of words) versus programming language

### Top 50 words across all languages

In [None]:
top_words_cloud = word_counts.sort_values(by='AllWords', ascending=False).head(50)


In [None]:
top_words_cloud= top_words_cloud.index.to_list()

In [None]:
top_words_cloud

In [None]:
top_words_cloud = " ".join(top_words_cloud)

In [None]:
top_words_cloud

### create word cloud

In [None]:
from wordcloud import WordCloud


img = WordCloud(background_color='white',colormap='Accent').generate(top_words_cloud)
# WordCloud() produces an image object, which can be displayed with plt.imshow
plt.imshow(img)
# axis aren't very useful for a word cloud
plt.axis('off')

In [None]:
# One cloud for the whole training set and one each for the JavaScript and
# Python READMEs, built from the word lists created earlier in this notebook.
all_cloud = WordCloud(background_color='white', height=1000, width=400).generate(' '.join(All_words))
javascript_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(JavaScript_words))
python_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(Python_words))

plt.figure(figsize=(10, 8))
# Left half: all words. Right half: JavaScript on top, Python below.
axs = [plt.axes([0, 0, .5, 1]), plt.axes([.5, .5, .5, .5]), plt.axes([.5, 0, .5, .5])]

axs[0].imshow(all_cloud)
axs[1].imshow(javascript_cloud)
axs[2].imshow(python_cloud)

axs[0].set_title('All Words')
axs[1].set_title('JavaScript')
axs[2].set_title('Python')

# axes aren't useful for a word cloud
for ax in axs: ax.axis('off')

### Top 10 Words unique to Python Vs JavaScript

In [None]:
# The ten highest-count Python words with a JavaScript count of 0, followed by
# the ten highest-count JavaScript words with a Python count of 0.
python_only = word_counts[word_counts.JavaScript == 0].sort_values(by='Python').tail(10)
javascript_only = word_counts[word_counts.Python == 0].sort_values(by='JavaScript').tail(10)
unique_df = pd.concat([python_only, javascript_only])


In [None]:
# figure out the percentage of spam vs ham
# NOTE(review): word_counts is built in this notebook with the columns
# 'JavaScript', 'Java', 'Python', 'TypeScript' and 'AllWords'. The 'spam', 'ham'
# and 'all' columns read here are not created anywhere in the notebook, so this
# cell looks like a leftover template for the Python vs JavaScript version that
# follows it -- confirm and remove.
(word_counts
 .assign(p_spam=word_counts.spam / word_counts['all'],
         p_ham=word_counts.ham / word_counts['all'])
 .sort_values(by='all')
 [['p_spam', 'p_ham']]
 .tail(20)
 .sort_values('p_ham')
 .plot.barh(stacked=True))

plt.title('Proportion of Spam vs Ham for the 20 most common words')


In [None]:
# For the 20 words with the highest AllWords count, plot the share of that
# count coming from Python READMEs and from JavaScript READMEs.
top_20_words = word_counts.sort_values(by='AllWords').tail(20)
proportions = top_20_words.assign(
    p_python=top_20_words.Python / top_20_words['AllWords'],
    p_javascript=top_20_words.JavaScript / top_20_words['AllWords'],
)[['p_python', 'p_javascript']]
proportions.sort_values('p_javascript').plot.barh(stacked=True)

plt.title('Proportion of Python vs JavaScript for the 20 most common words')

In [None]:
# Among words counted more than 10 times in both languages, show the five
# lowest and five highest Python-to-JavaScript ratios.
shared_words = word_counts[(word_counts.Python > 10) & (word_counts.JavaScript > 10)]
ranked = (shared_words
          .assign(ratio=shared_words.Python / (shared_words.JavaScript + .01))
          .sort_values(by='ratio'))
pd.concat([ranked.head(), ranked.tail()])


In [None]:
top_20_JavaScript_bigrams = (pd.Series(nltk.ngrams(JavaScript_words, 2))
                      .value_counts()
                      .head(20))

top_20_JavaScript_bigrams.head()


In [None]:
# Plot the very series whose index supplies the tick labels below, so bars and
# labels cannot drift apart if sort_values reorders bigrams with tied counts.
plotted_javascript_bigrams = top_20_JavaScript_bigrams.sort_values(ascending=False)
plotted_javascript_bigrams.plot.barh(color='pink', width=.9, figsize=(10, 6))

plt.title('20 Most frequently occuring JavaScript bigrams')
plt.ylabel('Bigram')
plt.xlabel('# Occurances')

# make the labels pretty: "word1 word2" instead of the raw tuple
ticks, _ = plt.yticks()
labels = [first + ' ' + second for first, second in plotted_javascript_bigrams.index]
_ = plt.yticks(ticks, labels)


In [None]:
top_20_Python_bigrams = (pd.Series(nltk.ngrams(Python_words, 2))
                      .value_counts()
                      .head(20))

top_20_Python_bigrams.head()

In [None]:
# Plot the very series whose index supplies the tick labels below, so bars and
# labels cannot drift apart if sort_values reorders bigrams with tied counts.
plotted_python_bigrams = top_20_Python_bigrams.sort_values(ascending=False)
plotted_python_bigrams.plot.barh(color='pink', width=.9, figsize=(10, 6))

plt.title('20 Most frequently occuring Python bigrams')
plt.ylabel('Bigram')
plt.xlabel('# Occurances')

# make the labels pretty: "word1 word2" instead of the raw tuple
ticks, _ = plt.yticks()
labels = [first + ' ' + second for first, second in plotted_python_bigrams.index]
_ = plt.yticks(ticks, labels)


In [None]:
word_counts['raw_count'] = word_counts.AllWords
#.assign(frequency=lambda df: df.raw_count / df.raw_count.sum()).assign(augmented_frequency=lambda df: df.frequency / df.frequency.max())

In [None]:
word_counts['frequency'] = word_counts.raw_count / word_counts.raw_count.sum()

In [None]:
word_counts['augmented_frequency'] = word_counts.frequency / word_counts.frequency.max()

In [None]:
word_counts

### Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# same basic process as any sklearn transformation:
# make the thing
cv = CountVectorizer()
# use the thing
bag_of_words = cv.fit_transform(train.clean_text)

In [None]:
bag_of_words

In [None]:

# Dense view of the bag-of-words matrix fitted on train.clean_text.
bow = pd.DataFrame(bag_of_words.todense())
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this version provides.
bow.columns = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
               else cv.get_feature_names())

In [None]:
bow

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
bag_of_words = tfidf.fit_transform(train.clean_text)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this version provides.
tfidf_feature_names = (tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out')
                       else tfidf.get_feature_names())

pd.DataFrame(bag_of_words.todense(),
             columns=tfidf_feature_names)

In [None]:
# Map every vocabulary word of the fitted TfidfVectorizer to its idf_ value.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this version provides.
bag_of_features = pd.Series(
    dict(
        zip(
            (tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out')
             else tfidf.get_feature_names()),
            tfidf.idf_
        )
    )
)

In [None]:
# A higher IDF score does not mean the word appears more often.
# It means the word appears in fewer READMEs, so it carries more weight when it does occur.
bag_of_features.sort_values(ascending=False).head(10)

In [None]:
cv = CountVectorizer(ngram_range=(2, 3))
bag_of_grams = cv.fit_transform(train.clean_text)

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
X_train = train.clean_text
y_train = train.language

X_validate = validate.clean_text
y_validate = validate.language

X_test = test.clean_text
y_test = test.language


In [None]:
X_train

### Decision Tree

In [None]:
# Whatever transformations we apply to X_train need to be applied to X_test,
# so the vectorizer is fitted on the training text only and reused afterwards.
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
# Seed the tree so repeated runs give the same scores; 123 is the seed the
# random forest in this notebook already uses.
tree = DecisionTreeClassifier(max_depth=3, random_state=123)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

In [None]:
# Whatever transformations we apply to X_train need to be applied to X_test
X_bow_val = cv.transform(X_validate)
tree.score(X_bow_val, y_validate)

### Random Forest

In [None]:
# Whatever transformations we apply to X_train need to be applied to X_test
cv1 = CountVectorizer()
X_bow1 = cv1.fit_transform(X_train)
rf = RandomForestClassifier(max_depth =6, 
                            min_samples_leaf = 2, 
                            random_state=123)
rf.fit(X_bow1, y_train)
rf.score(X_bow1, y_train)

In [None]:
# Whatever transformations we apply to X_train need to be applied to X_test
X_bow_val = cv1.transform(X_validate)
rf.score(X_bow_val, y_validate)

### KNN

In [None]:
cv2 = CountVectorizer()
X_bow = cv2.fit_transform(X_train)
knn = KNeighborsClassifier(n_neighbors=6, weights='uniform')
knn.fit(X_bow, y_train)
knn.score(X_bow, y_train)

In [None]:
X_bow_val = cv2.transform(X_validate)
knn.score(X_bow_val, y_validate)

# Start Here with models...Final Run

In [13]:
X_train,y_train,X_validate,y_validate, X_test, y_test = m.model_prep(train, validate, test)

In [14]:
cv = m.cv_countvectorizer(X_train)

In [15]:
DecisionTree_Train,DecisionTree_Validate=m.get_tree(X_train,y_train,X_validate,y_validate, X_test,y_test,cv)

In [16]:
KNN_Train,KNN_Validate = m.get_knn(X_train,y_train,X_validate,y_validate, X_test,y_test,cv)

In [17]:
RandomForest_Train,RandomForest_Validate =m.get_forest(X_train,y_train,X_validate,y_validate, X_test,y_test,cv)

In [None]:
evaluate_df

In [None]:
# Pass the scores in the same order as the rows of evaluate_df['Models']
# (DecisionTree, RandomForest, KNN; train before validate) so each score lands
# on the row of the model that produced it.
final_eval(DecisionTree_Train, DecisionTree_Validate, RandomForest_Train, RandomForest_Validate, KNN_Train, KNN_Validate, evaluate_df)

In [None]:
DecisionTree_Validate

In [None]:
DecisionTree_Train,DecisionTree_Validate=m.get_tree(X_train,y_train,X_validate,y_validate, X_test,y_test,cv)

In [None]:
KNN_Train,KNN_Validate = m.get_knn(X_train,y_train,X_validate,y_validate, X_test,y_test,cv)

In [None]:
RandomForest_Train,RandomForest_Validate =m.get_forest(X_train,y_train,X_validate,y_validate, X_test,y_test,cv)

In [39]:
# Row labels for the evaluation table: train and validate score per model.
models = [
    'DecisionTree_Train', 'DecisionTree_Validate',
    'RandomForest_Train', 'RandomForest_Validate',
    'KNN_Train', 'KNN_Validate',
]
def make_stats_df():
    '''
    Build the evaluation dataframe: a single 'Models' column holding the
    names in the module-level `models` list, one row per name.
    '''
    return pd.DataFrame({'Models': models})

def final_eval(a, b, c, d, e, f, df):
    '''
    Store six scores in a 'Scores' column of df.

    a..f: the scores, written to the rows of df in argument order.
    df: dataframe that receives the column; it is modified in place.
    Returns the same df.
    '''
    df['Scores'] = [a, b, c, d, e, f]
    return df

In [40]:
evaluate_df = make_stats_df()

### Evaluate Models

In [41]:
final_eval(DecisionTree_Train, DecisionTree_Validate, RandomForest_Train, RandomForest_Validate, KNN_Train, KNN_Validate, evaluate_df)

Unnamed: 0,Models,Scores
0,DecisionTree_Train,0.704762
1,DecisionTree_Validate,0.637363
2,RandomForest_Train,0.62381
3,RandomForest_Validate,0.494505
4,KNN_Train,0.580952
5,KNN_Validate,0.461538


### Modeling Summary

#### Baseline Score is 45% (0.45)
* The DecisionTree Model out-performed other models on train and validate data sets 
    * .704762
    * .637363
    
* The KNN Model came in dead last only slightly out-performing the baseline score
    * .580952
    * .461538
    
* The ideal model is expected to have out-performed the baseline score & have the highest accuracy score in comparison to other models.
    * For this reason DecisionTree model will now be fit to the test data set 

In [1]:
def get_tree_test(x_train, y_train, x_validate, y_validate, x_test, y_test,cv):
    '''
    Fit the Decision Tree model on the training data and print its accuracy
    on the test data set.

    x_train, y_train: training text and labels used to fit the vectorizer
        and the tree.
    x_validate, y_validate: accepted so the signature stays unchanged for
        existing callers; not used here.
    x_test, y_test: test text and labels the accuracy is computed on.
    cv: vectorizer with fit_transform/transform; it is re-fitted on x_train.
    '''
    # max_depth=3 with a fixed random state so the result can be replicated
    tree = DecisionTreeClassifier(max_depth=3, random_state=123)

    # fit the vectorizer and the model on the training data only
    X_bow = cv.fit_transform(x_train)
    tree.fit(X_bow, y_train)

    # apply the already-fitted vectorizer to the test data and score it
    X_bow_test = cv.transform(x_test)
    test_score = tree.score(X_bow_test, y_test)

    print('Accuracy of Decision Tree classifier model on test set: {:.2f}'
      .format(test_score))

### Decision Tree Model on Test

In [2]:
get_tree_test(X_train, y_train, X_validate, y_validate, X_test, y_test,cv)

NameError: name 'X_train' is not defined

In [69]:
test_score

0.6710526315789473

### Conclusions
* Decision Tree model Accuracy scores:
    
        * 0.704762 on training data samples
        * 0.637363 on validate data samples
        * 0.671052 on test data samples
        
#### Key TakeAway:
    The Decision Tree model beat the 45% baseline accuracy on the train, validate and test data sets.

### Recommendations

   * Consider acquiring larger "text" datasets
   * Consider hyperparameter tuning
   * Consider gradient boosting algorithms