# Step 3: Clean the data again for Combined Data
___
Here we load the combined vectorized data and clean it once more to make sure it is ready for the NLP stage.
___

## Import Libraries
___

In [311]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

#show up to 100 characters of each cell value when pandas displays a frame
pd.set_option('display.max_colwidth', 100)
#render matplotlib figures inline in the notebook output
%matplotlib inline

#English stopword list from the NLTK corpus and a WordNet lemmatizer instance
#NOTE(review): neither name is referenced in the cells visible in this notebook - confirm they are still needed
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

In [312]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Load Combined Data
___
Both versions of the combined data set, combined_AA_and_SS_df & combined_AA_and_SS_df-new_stopwords, will be loaded and cleaned further before being modelled.

In [313]:
#read both versions of the combined data set (base and new-stopwords) into their own dataframes
combined_df, combined_new_stopwords_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/combined_AA_and_SS_df.csv",
        "../data/combined_AA_and_SS_df-new_stopwords.csv",
    )
)

In [314]:
#write a copy of both freshly loaded frames to the BACKUP folder in case the data files get corrupted
#NOTE(review): to_csv is called without index=False, so pandas also writes the row index as an
#extra first column - the backups will not have the same columns as the source files when read back
combined_df.to_csv('../data/BACKUP/backup_combined_df.csv')
combined_new_stopwords_df.to_csv('../data/BACKUP/backup_combined_new_stopwords_df.csv')

In [315]:
#preview the first five rows of combined_df
combined_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,alcoholicsanonymous
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,alcoholicsanonymous
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,alcoholicsanonymous
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,alcoholicsanonymous
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,alcoholicsanonymous


In [316]:
#preview the last five rows of combined_df
combined_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking
1999,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking


In [317]:
#preview the first five rows of combined_new_stopwords_df
combined_new_stopwords_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,alcoholicsanonymous
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,alcoholicsanonymous
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,alcoholicsanonymous
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,alcoholicsanonymous
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,alcoholicsanonymous


In [318]:
#preview the last five rows of combined_new_stopwords_df
combined_new_stopwords_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking
1999,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,stopsmoking


In [319]:
#summarise the index range, number of columns, dtypes and memory usage of combined_df
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 1001 entries, 0 to subreddit
dtypes: int64(1000), object(1)
memory usage: 15.3+ MB


In [320]:
#summarise the index range, number of columns, dtypes and memory usage of combined_new_stopwords_df
combined_new_stopwords_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 1001 entries, 0 to subreddit
dtypes: int64(1000), object(1)
memory usage: 15.3+ MB


In [321]:
#summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of combined_df
combined_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.0065,0.009,0.02,0.0035,0.0035,0.0115,0.0035,0.005,0.0065,0.0045,...,0.0035,0.0305,0.005,0.0105,0.0055,0.0155,0.009,0.004,0.0185,0.0055
std,0.08638,0.099619,0.150371,0.059072,0.059072,0.146214,0.059072,0.089325,0.08638,0.074044,...,0.059072,0.183266,0.070551,0.106749,0.073976,0.155794,0.099619,0.063135,0.134784,0.073976
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,2.0,2.0,1.0,1.0,4.0,1.0,3.0,2.0,2.0,...,1.0,2.0,1.0,2.0,1.0,4.0,2.0,1.0,1.0,1.0


In [322]:
#summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of combined_new_stopwords_df
combined_new_stopwords_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.063,0.017,0.013,0.0455,0.014,0.0155,0.024,0.015,0.0115,0.015,...,0.036,0.0115,0.0115,0.0135,0.04,0.014,0.0285,0.0195,0.0085,0.0095
std,0.27031,0.147383,0.121813,0.245886,0.125745,0.135162,0.162595,0.129551,0.106646,0.1476,...,0.233947,0.115648,0.111238,0.127773,0.228968,0.121702,0.343142,0.138309,0.111508,0.111427
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,3.0,2.0,4.0,2.0,3.0,2.0,2.0,1.0,3.0,...,4.0,2.0,2.0,2.0,3.0,2.0,12.0,1.0,3.0,3.0


# Label Target Value
___

In [323]:
#work on independent copies of the loaded frames: a plain assignment would only create a second
#name for the same object, so the mapping below would overwrite the loaded data and re-running
#this cell would map the already-numeric labels to NaN
update1_combined_df = combined_df.copy()
update1_combined_new_stopwords_df = combined_new_stopwords_df.copy()

#encode the target column 'subreddit'
#set alcoholicsanonymous = 1 and stopsmoking = 0
subreddit_to_label = {'alcoholicsanonymous':1, 'stopsmoking':0}
update1_combined_df['subreddit'] = update1_combined_df['subreddit'].map(subreddit_to_label)
update1_combined_new_stopwords_df['subreddit'] = update1_combined_new_stopwords_df['subreddit'].map(subreddit_to_label)

#Series.map turns every value that is missing from the dictionary into NaN,
#so stop here instead of passing unlabeled rows on to the next steps
assert update1_combined_df['subreddit'].notna().all(), \
    "combined_df contains a 'subreddit' value that is not in subreddit_to_label"
assert update1_combined_new_stopwords_df['subreddit'].notna().all(), \
    "combined_new_stopwords_df contains a 'subreddit' value that is not in subreddit_to_label"



In [324]:
#preview the first five rows to confirm that 'subreddit' now holds the mapped numeric labels
update1_combined_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [325]:
#preview the last five rows to confirm that 'subreddit' now holds the mapped numeric labels
update1_combined_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1999,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [326]:
#preview the first five rows to confirm that 'subreddit' now holds the mapped numeric labels
update1_combined_new_stopwords_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [327]:
#preview the last five rows to confirm that 'subreddit' now holds the mapped numeric labels
update1_combined_new_stopwords_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1999,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The new update1_combined_df & update1_combined_new_stopwords_df look good, but they still need to be checked for duplicates and null values.

### Check for duplicates and null values
___


In [328]:
#count the rows that are exact duplicates of an earlier row (compared across all columns, including 'subreddit')
update1_combined_df.duplicated().sum()

551

In [329]:
#count the rows that are exact duplicates of an earlier row (compared across all columns, including 'subreddit')
update1_combined_new_stopwords_df.duplicated().sum()

438

In [330]:
#total number of null values across every column of update1_combined_df
update1_combined_df.isnull().sum().sum()

0

In [331]:
#total number of null values across every column of update1_combined_new_stopwords_df
update1_combined_new_stopwords_df.isnull().sum().sum()

0

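After concatenating, columns for words that appear only in alcoholicsanonymous posts can be empty for stopsmoking rows, and vice versa. To fix this, any missing value is replaced with the digit '0'. (The null check above found no missing values in the loaded files, so this step acts as a safeguard.)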

### For base combined data set

In [332]:
#replace any null values with 0, modifying the frame in place
#(the null check above returned 0 for this frame, so this is only a safeguard)
update1_combined_df.fillna(0, inplace=True)

In [333]:
#take an independent copy so that later in-place changes to update2_combined_df
#(such as dropping duplicates) do not also alter update1_combined_df;
#a plain assignment would only give the same dataframe a second name
update2_combined_df = update1_combined_df.copy()

#check number of rows and columns
update2_combined_df.shape

(2000, 1001)

In [334]:
#drop the rows that are exact duplicates of an earlier row; the duplicate check above
#counted 551 of them for this data set, so this step does change the data.
#drop_duplicates() returns a new frame, so any other variable that still refers to the
#un-deduplicated data is left untouched and re-running this cell gives the same result
update2_combined_df = update2_combined_df.drop_duplicates()

#check for duplicates again
update2_combined_df.duplicated().sum()

0

In [335]:
#preview the first five rows after dropping duplicates; gaps in the index mark rows that were removed
update2_combined_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [336]:
#preview the last five rows after dropping duplicates; gaps in the index mark rows that were removed
update2_combined_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
1993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1999,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [337]:
#check number of rows and columns after dropping duplicates
update2_combined_df.shape

(1449, 1001)

### For Combined Data w/ new Stopwords

In [338]:
#replace any null values with 0, modifying the frame in place
#(the null check above returned 0 for this frame, so this is only a safeguard)
update1_combined_new_stopwords_df.fillna(0, inplace=True)

In [339]:
#take an independent copy so that later in-place changes to update2_new_stopwords_combined_df
#(such as dropping duplicates) do not also alter update1_combined_new_stopwords_df;
#a plain assignment would only give the same dataframe a second name
update2_new_stopwords_combined_df = update1_combined_new_stopwords_df.copy()

#check number of rows and columns
update2_new_stopwords_combined_df.shape

(2000, 1001)

In [340]:
#drop the rows that are exact duplicates of an earlier row; the duplicate check above
#counted 438 of them for this data set, so this step does change the data.
#drop_duplicates() returns a new frame, so any other variable that still refers to the
#un-deduplicated data is left untouched and re-running this cell gives the same result
update2_new_stopwords_combined_df = update2_new_stopwords_combined_df.drop_duplicates()

#check for duplicates again
update2_new_stopwords_combined_df.duplicated().sum()

0

In [341]:
#preview the first five rows after dropping duplicates
update2_new_stopwords_combined_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [342]:
#preview the last five rows after dropping duplicates; gaps in the index mark rows that were removed
update2_new_stopwords_combined_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,subreddit
1993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1999,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [343]:
#check number of rows and columns after dropping duplicates
update2_new_stopwords_combined_df.shape

(1562, 1001)

### Export CSV to Data Folder to be used for prediction

In [344]:
#write each cleaned dataframe to the data folder for the modelling step,
#leaving out both the row index and the header row
for cleaned_frame, csv_path in (
    (update2_combined_df, '../data/Combined_DF_for_Naive_Bayes.csv'),
    (update2_new_stopwords_combined_df, '../data/Combined_DF_new_stopwords_for_Naive_Bayes.csv'),
):
    cleaned_frame.to_csv(csv_path, index=None, header=None)