In [6]:
import pandas as pd
import numpy as np
import re
import unicodedata
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import nltk.sentiment
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from requests import get
from bs4 import BeautifulSoup
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
import re
import time

In [4]:
train = pd.read_csv('train.csv', index_col=0)

In [5]:
train

Unnamed: 0_level_0,Title,Genre,Description
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...
54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [28]:
def basic_clean(string):
    '''
    This function accepts a string as an input
    then lowercases everything, normalizes unicode
    characters, and replaces anything that is
    not a letter, number, whitespace, 
    or a single quote.
    '''
    cleaned = string.lower()
    cleaned = unicodedata.normalize('NFKD', cleaned)    .encode('ascii', 'ignore')    .decode('utf-8', 'ignore')
    cleaned = re.sub(r"[^a-z0-9'\s]", '', cleaned)
    
    return cleaned


# In[3]:


def tokenize(string):
    '''
    This function takes in a string as an input
    then tokenizes all words in the string.
    '''
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(string, return_str=True)


# In[4]:


def stem(string):
    '''
    This function takes in a string as an input
    then stems all words in the string.
    '''
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    string_stemmed = ' '.join(stems)
    return string_stemmed


# In[5]:


def lemmatize(string):
    '''
    This function takes in a string as an input
    then lemmatizes all words in the string.
    '''
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    string_lemmatized = ' '.join(lemmas)

    return string_lemmatized


# In[6]:


def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string as an input
    then removes stopwords. The function has two
    additional parameters that define additional
    stopwords to remove in extra_words as a list,
    and defines stopwords to exclude from removal
    in exlude_words as a list. extra_words and
    exclude_words are empty lists by default.
    '''
    stopword_list = stopwords.words('english')
    stopword_list = set(stopword_list) - set(exclude_words)
    stopword_list = stopword_list.union(set(extra_words))
    
    words = string.split()
    
    filtered_words = [w for w in words if w not in stopword_list]

    #print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    #print('---')

    string_without_stopwords = ' '.join(filtered_words)

    return string_without_stopwords


# In[7]:
#extra_stops = ['server', 'run', '&#9;', "' ", " '", "'",'Minecraft','minecraft','minecraft ',' minecraft', 'abstract','and','arguments','assert','break','byte','case','char','class',
##               'const','continue','default','double','else','enum','extends','false','final','finally','float','for',
#               'goto','if','implements','import','in','instanceof','int','interface','long','native','new','null',
#               'package','pass','private','protected','public','raise','return','short','static','super','switch',
#               'synchronized','this','throw','throws','transient','true','try','void','volatile','while','with',
#               'yield', 'http', 'com', 'github', 'www', 'version', 'file']

extra_stops = ["' ", " '", "'", " ' "]

def prep_movie_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the repo name, original text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df = df.dropna()
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords,
                                  extra_words=extra_words,
                                  exclude_words=exclude_words)
    
    
    df['lemmatized'] = df['clean'].apply(lemmatize)
    df['lemmatized'] = df['lemmatized'].str.replace(r"[\"\',]", '')
    
    #df = map_other_languages(df)
    
    return df


#
#def map_other_languages(df):
#    '''
#    This function takes in a df with 'languages' column
#    containing the coding language of the repo. Any language
#    that is not Python, Java, or JavaScript will be marked
#    as 'Other'
#    '''
#    top_languages = ['Python', 'Java', 'JavaScript']
#    df.loc[~df['language'].isin(top_languages), 'language'] = 'Other'
    
    return df

In [12]:
def get_wordcount_bar(train):
    '''
    This function takes in the training dataset and creates a bar plot of the
    average wordcount of repository based on their language type
    '''
    #Make a column on the df for word count
    train['word_count'] = train.lemmatized.str.split().apply(len)
    #Use groupby to get an average length per language
    language_wordcount = train.groupby('language').word_count.mean().sort_values(ascending=False)
    #Set style, make a chart
    sns.set_style("darkgrid")
    fig, axes = plt.subplots(figsize=(9, 6))
    ax = sns.barplot(x=language_wordcount.values, 
                 y=language_wordcount.index, palette='Set3')
    plt.title('Average Wordcount of Languages in Readme Files')
    plt.xlabel("Average Word Count")
    plt.ylabel('Language')
    plt.show()

In [29]:
train = prep_movie_data(train, 'Description')

  df['lemmatized'] = df['lemmatized'].str.replace(r"[\"\',]", '')


In [30]:
train.head(3)

Unnamed: 0_level_0,Title,Genre,Description,clean,lemmatized
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening conversation doctor parents 10yearol...,listening conversation doctor parent 10yearold...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,brother sister past incestuous relationship cu...,brother sister past incestuous relationship cu...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,bus empties students field trip museum natural...,bus empty student field trip museum natural hi...


In [31]:
train.to_csv('train_prepped.csv')

In [15]:
test = pd.read_csv('test.csv', index_col=0)

In [16]:
test = prep_movie_data(test, 'Description')

In [18]:
test.head(3)

Unnamed: 0_level_0,Title,Description,clean,lemmatized
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar...",lr brane loves life car apartment job especial...,lr brane love life car apartment job especiall...
2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch...",spain march 1964 quico naughty child three bel...,spain march 1964 quico naughty child three bel...
3,Off the Beaten Track (2010),One year in the life of Albin and his family ...,one year life albin family shepherds north tra...,one year life albin family shepherd north tran...
