In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import requests
import re
import warnings
warnings.filterwarnings("ignore")

import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from prepare import missing_values_table

In [2]:
# Load the training data from train.csv (path is relative to the working directory).
df = pd.read_csv('train.csv')

In [3]:
# Preview the first rows of the raw frame to see the available columns.
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,"When the young people returned to the ballroom, it presented a decidedly changed appea...",-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewhat silent, her eyes resting on Dolly wit...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as quickly as it came, and two days after th...",-0.580118,0.476676
3,dd1000b26,,,"And outside before the palace a great garden was walled round, filled full of stately ...",-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who lived together in a house of their own in ...,0.247197,0.510845


In [4]:
# (rows, columns) of the raw frame.
df.shape


(2834, 6)

In [5]:
# Inspection only: drop_duplicates() returns a new frame and the result is not
# assigned, so df itself is left unchanged by this cell.
df.drop_duplicates()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,"When the young people returned to the ballroom, it presented a decidedly changed appea...",-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewhat silent, her eyes resting on Dolly wit...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as quickly as it came, and two days after th...",-0.580118,0.476676
3,dd1000b26,,,"And outside before the palace a great garden was walled round, filled full of stately ...",-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who lived together in a house of their own in ...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files/2011/06/dinosaurs_45_text.pdf,CC BY-SA 3.0,"When you think of dinosaurs and where they lived, what do you picture? Do you see hot,...",1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_Elements/Solids,CC BY-SA 3.0,So what is a solid? Solids are usually hard because their molecules have been packed t...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_Elements/Liquids,CC BY-SA 3.0,The second state of matter we will discuss is a liquid. Solids are hard things you can...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Elementary_School/Solids,CC BY-SA 3.0,"Solids are shapes that you can actually touch. They have three dimensions, which means...",-0.215279,0.514128


In [6]:
# Set id as index.
# Rebinding (instead of inplace=True) plus the guard keeps this cell safe to
# re-run: once 'id' is the index it is no longer a column, and a second
# set_index('id') would raise KeyError.
if df.index.name != 'id':
    df = df.set_index('id')

In [7]:
# No duplicates were found in the previous check.
# Next, check for NaNs: summarise zero and null counts per column with the
# missing_values_table helper imported from prepare.

missing_values_table(df)

Your selected dataframe has 5 columns and 2834 Rows.
There are 2 columns that have NULL values.


Unnamed: 0,Zero Values,null_count,% of Total Values,Total Zeroes + Null Values,% Total Zero + Null Values,Data Type
url_legal,0,2004,70.7,2004,70.7,object
license,0,2004,70.7,2004,70.7,object
excerpt,0,0,0.0,0,0.0,object
target,1,0,0.0,1,0.0,float64
standard_error,1,0,0.0,1,0.0,float64


Over 2,000 of the 2,834 rows (about 70.7%) are null in both `url_legal` and `license`, so let's drop these two columns.

In [8]:
# Drop the two mostly-null columns. Rebinding instead of inplace=True, together
# with errors='ignore', lets this cell be re-run after the columns are already
# gone (the in-place version raises KeyError on a second run).
df = df.drop(columns=['url_legal', 'license'], errors='ignore')

In [9]:
# Preview the frame again to confirm which columns remain.
df.head()

Unnamed: 0_level_0,excerpt,target,standard_error
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c12129c31,"When the young people returned to the ballroom, it presented a decidedly changed appea...",-0.340259,0.464009
85aa80a4c,"All through dinner time, Mrs. Fayre was somewhat silent, her eyes resting on Dolly wit...",-0.315372,0.480805
b69ac6792,"As Roger had predicted, the snow departed as quickly as it came, and two days after th...",-0.580118,0.476676
dd1000b26,"And outside before the palace a great garden was walled round, filled full of stately ...",-1.054013,0.450007
37c1b32fb,Once upon a time there were Three Bears who lived together in a house of their own in ...,0.247197,0.510845


In [10]:
# Collect the index labels of every row whose excerpt is missing.
null_narrative = df.loc[df['excerpt'].isna()].index

In [11]:
# Clean up excerpt text

def clean_nlp(df):
    """Normalize one text string and return its lemmatized, stopword-free tokens.

    Despite its name, ``df`` is expected to be a single text string (for
    example one value of the ``excerpt`` column), not a DataFrame. The
    parameter name is kept so existing callers keep working.

    Steps: NFKD-normalize and drop non-ASCII characters, lowercase, remove
    every character that is neither a word character nor whitespace, split on
    whitespace, drop English stopwords (plus ``ADDITIONAL_STOPWORDS``), and
    lemmatize the remaining words with NLTK's ``WordNetLemmatizer``.

    Note: punctuation is stripped before the stopword comparison, so stopwords
    that contain an apostrophe (e.g. "don't") do not match their stripped form.

    Parameters
    ----------
    df : str or None
        Text to clean. ``None`` and float NaN are treated as empty text.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order.

    Raises
    ------
    TypeError
        If ``df`` is neither a string nor a missing value.
    """
    ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt', '\n', 'ha', 'n', '41']

    # Missing values (None / NaN) carry no text, so they yield no tokens.
    if df is None or (isinstance(df, float) and df != df):
        return []
    if not isinstance(df, str):
        raise TypeError(
            'clean_nlp expects a text string, got %s' % type(df).__name__
        )

    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives O(1) membership tests; the distinct local name avoids
    # shadowing the module-level `stopwords` import.
    stop_set = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)

    # The original read an undefined local `narrative` here (UnboundLocalError
    # on every call); the text to clean is the function argument.
    narrative = (unicodedata.normalize('NFKD', df)
                 .encode('ascii', 'ignore')
                 .decode('utf-8', 'ignore')
                 .lower())
    words = re.sub(r'[^\w\s]', '', narrative).split()
    return [wnl.lemmatize(word) for word in words if word not in stop_set]