In [6]:
# loading modules
import numpy as np 
import pandas as pd 
from pathlib import Path
from fastai.text import *
import warnings

In [7]:
warnings.simplefilter("ignore")

In [91]:
# Load the train and test CSVs (paths are relative to the notebook's working directory).
# `axis` is passed by keyword: pandas 2.0 made DataFrame.dropna's arguments
# keyword-only, so the positional form dropna(0) raises TypeError there.
train = pd.read_csv('../data/Train.csv').dropna(axis=0)  # drop training rows that have any missing value
test = pd.read_csv('../data/Test.csv').fillna('')  # keep every test row; missing values become empty strings
test['label'] = 0  # placeholder column, to be filled in with predictions later
train.head(3)  # take a peek at the data


Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0


## Cleaning test data

In [93]:
# Pull out the tweet text column of the test set; the cells below clean it step by step.
text = test.loc[:, "safe_text"]
text

0       <user> <user> ... &amp; 4 a vaccine given 2 he...
1       Students starting school without whooping coug...
2       I'm kinda over every ep of <user> being "rippe...
3       How many innocent children die for lack of vac...
4       CDC eyeing bird flu vaccine for humans, though...
                              ...                        
5172    jenny mccarthy is on new years rockin eve. wha...
5173    Measles reported in Clark Co. for 1st time sin...
5174    <user> issues alert regarding Measles in TX. K...
5175    I can't believe people don't vaccinate their k...
5176    "<user>  Alternatives to #Flu Vaccine <url> #n...
Name: safe_text, Length: 5177, dtype: object

In [95]:
# Remove the anonymisation placeholders, one after the other.
for placeholder in ("<user>", "<url>"):
    text = text.replace(placeholder, '', regex=True)

In [96]:
# Preview the first rows (head() defaults to 5).
text.head()

0      ... &amp; 4 a vaccine given 2 healthy peeps,...
1    Students starting school without whooping coug...
2    I'm kinda over every ep of  being "ripped from...
3    How many innocent children die for lack of vac...
4    CDC eyeing bird flu vaccine for humans, though...
Name: safe_text, dtype: object

In [97]:
# Delete every character that is not an ASCII letter, a digit or a space.
# NOTE(review): HTML entities are not decoded first, so "&amp;" is left behind as the word "amp".
disallowed_chars = '[^a-zA-Z0-9 ]'
text = text.replace(to_replace=disallowed_chars, value='', regex=True)
text.head()

0       amp 4 a vaccine given 2 healthy peeps FDA t...
1    Students starting school without whooping coug...
2    Im kinda over every ep of  being ripped from t...
3    How many innocent children die for lack of vac...
4    CDC eyeing bird flu vaccine for humans though ...
Name: safe_text, dtype: object

In [98]:
# Cast every entry to str so the string method used in the next cell is available on each element.
text = text.astype(dtype=str)

In [100]:
# Trim the leading whitespace left behind by the removals above.
text = text.str.lstrip()
text.head()

0       amp 4 a vaccine given 2 healthy peeps FDA thin...
1       Students starting school without whooping coug...
2       Im kinda over every ep of  being ripped from t...
3       How many innocent children die for lack of vac...
4       CDC eyeing bird flu vaccine for humans though ...
                              ...                        
5172    jenny mccarthy is on new years rockin eve what...
5173    Measles reported in Clark Co for 1st time sinc...
5174    issues alert regarding Measles in TX Keep your...
5175    I cant believe people dont vaccinate their kid...
5176    Alternatives to Flu Vaccine  natural health A ...
Name: safe_text, Length: 5177, dtype: object

In [101]:
# Show the cleaned test text once more before it is written back to the frame.
text

0       amp 4 a vaccine given 2 healthy peeps FDA thin...
1       Students starting school without whooping coug...
2       Im kinda over every ep of  being ripped from t...
3       How many innocent children die for lack of vac...
4       CDC eyeing bird flu vaccine for humans though ...
                              ...                        
5172    jenny mccarthy is on new years rockin eve what...
5173    Measles reported in Clark Co for 1st time sinc...
5174    issues alert regarding Measles in TX Keep your...
5175    I cant believe people dont vaccinate their kid...
5176    Alternatives to Flu Vaccine  natural health A ...
Name: safe_text, Length: 5177, dtype: object

In [102]:
# Replace the raw tweet text in the test frame with the cleaned version.
test = test.assign(safe_text=text)

In [103]:
# Discard any column that holds nothing but missing values, then show the frame.
test = test.dropna(how='all', axis='columns')
test

Unnamed: 0,tweet_id,safe_text,label
0,00BHHHP1,amp 4 a vaccine given 2 healthy peeps FDA thin...,0
1,00UNMD0E,Students starting school without whooping coug...,0
2,01AXPTJF,Im kinda over every ep of being ripped from t...,0
3,01HOEQJW,How many innocent children die for lack of vac...,0
4,01JUKMAO,CDC eyeing bird flu vaccine for humans though ...,0
...,...,...,...
5172,ZXVVNC5O,jenny mccarthy is on new years rockin eve what...,0
5173,ZYIANVI8,Measles reported in Clark Co for 1st time sinc...,0
5174,ZYITEHAH,issues alert regarding Measles in TX Keep your...,0
5175,ZZ3BMBTG,I cant believe people dont vaccinate their kid...,0


## Cleaning training data

In [104]:
# Pull out the tweet text column of the training set.
# Note: this rebinds `text`, which held the test text in the section above.
text = train.loc[:, "safe_text"]
text

0        Me &amp; The Big Homie meanboy3000 #MEANBOY #M...
1        I'm 100% thinking of devoting my career to pro...
2        #whatcausesautism VACCINES, DO NOT VACCINATE Y...
3        I mean if they immunize my kid with something ...
4        Thanks to <user> Catch me performing at La Nui...
                               ...                        
9996     Living in a time where the sperm I used to was...
9997     <user> <user>  In spite of all measles outbrea...
9998     Interesting trends in child immunization in Ok...
9999     CDC Says Measles Are At Highest Levels In Deca...
10000    Pneumonia vaccine: for women w risk of pulmona...
Name: safe_text, Length: 9999, dtype: object

In [105]:
# Remove the anonymisation placeholders, one after the other.
for placeholder in ("<user>", "<url>"):
    text = text.replace(placeholder, '', regex=True)

In [106]:
# Trim the leading whitespace left behind by the placeholder removal.
text = text.str.lstrip()
text.head()

0    Me &amp; The Big Homie meanboy3000 #MEANBOY #M...
1    I'm 100% thinking of devoting my career to pro...
2    #whatcausesautism VACCINES, DO NOT VACCINATE Y...
3    I mean if they immunize my kid with something ...
4    Thanks to  Catch me performing at La Nuit NYC ...
Name: safe_text, dtype: object

In [107]:
# Delete every character that is not an ASCII letter, a digit or a space, then
# trim leading whitespace again. The earlier lstrip ran before this step, so a
# tweet that began with punctuation followed by a space would otherwise keep a
# leading space. The test-set cleaning strips after this regex; doing the same
# here keeps train and test text prepared identically.
# NOTE(review): HTML entities are not decoded first, so "&amp;" is left behind as the word "amp".
text = text.replace('[^a-zA-Z0-9 ]', '', regex=True).str.lstrip()
text.head(5)

0    Me amp The Big Homie meanboy3000 MEANBOY MB MB...
1    Im 100 thinking of devoting my career to provi...
2    whatcausesautism VACCINES DO NOT VACCINATE YOU...
3    I mean if they immunize my kid with something ...
4    Thanks to  Catch me performing at La Nuit NYC ...
Name: safe_text, dtype: object

In [110]:
# Replace the raw tweet text in the training frame with the cleaned version and preview it.
train = train.assign(safe_text=text)
train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me amp The Big Homie meanboy3000 MEANBOY MB MB...,0.0,1.0
1,E3303EME,Im 100 thinking of devoting my career to provi...,1.0,1.0
2,M4IVFSMS,whatcausesautism VACCINES DO NOT VACCINATE YOU...,-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to Catch me performing at La Nuit NYC ...,0.0,1.0
