# Project 5
---
*By Ihza Gonzales*

## Libraries Used
---

In [1]:
import numpy as np
import pandas as pd

from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import regex as re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Data Cleaning
---

In [2]:
anx = pd.read_csv('anx_submissions.csv')
writ = pd.read_csv('writing_submissions.csv')

In [3]:
anx.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,removed_by_category,author_flair_template_id,author_flair_text_color,author_flair_background_color,edited,author_cakeday,is_created_from_ads_ui,author_is_blocked,distinguished,banned_by
0,[],False,JackW357,,[],,text,t2_86tvd1p6,False,False,...,,,,,,,,,,
1,[],False,belladoll1021,,[],,text,t2_3imzzz6p,False,False,...,,,,,,,,,,
2,[],False,ashwinderegg,,[],,text,t2_3o3tfuf3,False,False,...,,,,,,,,,,
3,[],False,ashwinderegg,,[],,text,t2_3o3tfuf3,False,False,...,,,,,,,,,,
4,[],False,lachapoxxx,,[],,text,t2_93gbsj7i,False,False,...,,,,,,,,,,


In [4]:
writ.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,distinguished,media,media_embed,secure_media,secure_media_embed,author_flair_background_color,edited,banned_by,is_created_from_ads_ui,author_is_blocked
0,[],False,Yersenie,,[],,text,t2_9jnd5n8o,False,False,...,,,,,,,,,,
1,[],False,hoe4hob1,,[],,text,t2_8wg687lh,False,False,...,,,,,,,,,,
2,[],False,Pagliacci_Baby,,[],,text,t2_7d3owwgj,False,False,...,,,,,,,,,,
3,[],False,Jp_web_agency,,[],,text,t2_7tw9syp3,False,False,...,,,,,,,,,,
4,[],False,Jp_web_agency,,[],,text,t2_7tw9syp3,False,False,...,,,,,,,,,,


### Merge Datasets

In [5]:
df = anx.append(writ)

In [6]:
df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,banned_by,thumbnail_height,thumbnail_width,url_overridden_by_dest,crosspost_parent,crosspost_parent_list,media,media_embed,secure_media,secure_media_embed
0,[],False,JackW357,,[],,text,t2_86tvd1p6,False,False,...,,,,,,,,,,
1,[],False,belladoll1021,,[],,text,t2_3imzzz6p,False,False,...,,,,,,,,,,
2,[],False,ashwinderegg,,[],,text,t2_3o3tfuf3,False,False,...,,,,,,,,,,
3,[],False,ashwinderegg,,[],,text,t2_3o3tfuf3,False,False,...,,,,,,,,,,
4,[],False,lachapoxxx,,[],,text,t2_93gbsj7i,False,False,...,,,,,,,,,,


In [7]:
df = df[['author', 'link_flair_text', 'num_comments', 'subreddit', 'selftext', 'title', 'created_utc']]

In [8]:
df.head()

Unnamed: 0,author,link_flair_text,num_comments,subreddit,selftext,title,created_utc
0,JackW357,DAE Questions,9,Anxiety,,Anyone else scared of dying and scared of when...,1606687976
1,belladoll1021,Health,1,Anxiety,Can a tight throat and gagging feeling be anxi...,Tight throat,1606687615
2,ashwinderegg,Advice Needed,3,Anxiety,Does anyone else feel like they can no longer ...,Anxiety overriding my intuition.,1606687588
3,ashwinderegg,Advice Needed,7,Anxiety,Does anyone else feel like they can no longer ...,Anxiety overriding my intuition.,1606687588
4,lachapoxxx,Advice Needed,1,Anxiety,my anxiety has been going crazy this past week...,hey friends! i need some advice,1606687488


### Clean Dataset

In [9]:
df.isnull().sum()

author                0
link_flair_text    1953
num_comments          0
subreddit             0
selftext            322
title                 0
created_utc           0
dtype: int64

In [10]:
df.fillna(' ', inplace = True)

In [11]:
df.isnull().sum()

author             0
link_flair_text    0
num_comments       0
subreddit          0
selftext           0
title              0
created_utc        0
dtype: int64

In [12]:
df[df['selftext'] == '[removed]'].count()

author             949
link_flair_text    949
num_comments       949
subreddit          949
selftext           949
title              949
created_utc        949
dtype: int64

In [13]:
df[df['selftext'] == '[deleted]'].count()

author             56
link_flair_text    56
num_comments       56
subreddit          56
selftext           56
title              56
created_utc        56
dtype: int64

In [14]:
df.replace({'[removed]':' ', '[deleted]':' '}, inplace = True)

In [15]:
df['text'] = df['title'] + ' ' + df['selftext']

In [16]:
df.head(3)

Unnamed: 0,author,link_flair_text,num_comments,subreddit,selftext,title,created_utc,text
0,JackW357,DAE Questions,9,Anxiety,,Anyone else scared of dying and scared of when...,1606687976,Anyone else scared of dying and scared of when...
1,belladoll1021,Health,1,Anxiety,Can a tight throat and gagging feeling be anxi...,Tight throat,1606687615,Tight throat Can a tight throat and gagging fe...
2,ashwinderegg,Advice Needed,3,Anxiety,Does anyone else feel like they can no longer ...,Anxiety overriding my intuition.,1606687588,Anxiety overriding my intuition. Does anyone e...


In [17]:
df.drop(columns = ['selftext', 'title'], inplace = True)

In [18]:
df.head()

Unnamed: 0,author,link_flair_text,num_comments,subreddit,created_utc,text
0,JackW357,DAE Questions,9,Anxiety,1606687976,Anyone else scared of dying and scared of when...
1,belladoll1021,Health,1,Anxiety,1606687615,Tight throat Can a tight throat and gagging fe...
2,ashwinderegg,Advice Needed,3,Anxiety,1606687588,Anxiety overriding my intuition. Does anyone e...
3,ashwinderegg,Advice Needed,7,Anxiety,1606687588,Anxiety overriding my intuition. Does anyone e...
4,lachapoxxx,Advice Needed,1,Anxiety,1606687488,hey friends! i need some advice my anxiety has...


In [19]:
df.to_csv('anx_writing.csv')

## Custom Processor

In [20]:
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

In [32]:
rows = [row for row in df['text']]
text =  ' '.join(rows)

In [35]:
p = re.compile(\b([a-z])\1\1\1.+?\b, re.IGNORECASE)
m = p.findall(text)        

SyntaxError: unexpected character after line continuation character (<ipython-input-35-1fc159d5337c>, line 1)

In [46]:
regex = "\b([a-z])\1.+?\b"

matches = re.findall(regex, text, re.MULTILINE | re.IGNORECASE)
matches

[]

In [None]:
def scrub_words(text):
    """Basic cleaning of texts."""
    
    # remove html markup
    text=re.sub("(<.*?>)","",text)
    
    #remove non-ascii and digits
    text=re.sub("(\\W|\\d)"," ",text)
    
    #remove whitespace
    text=text.strip()
    return text

In [34]:
m

[]

In [20]:
def custom_preprocessor(text):
    text = text.lower() #lowercases word
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE) #remove url
    text = re.sub("[^a-zA-Z]", "",str(text)) #text only
    
    #copied from https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
   
    
    return text

#copied from https://www.studytonight.com/post/scikitlearn-countvectorizer-in-nlp

In [22]:
df['subreddit'].value_counts(normalize = True)

Anxiety    0.5
writing    0.5
Name: subreddit, dtype: float64

In [23]:
df.dtypes

author             object
link_flair_text    object
num_comments        int64
subreddit          object
created_utc         int64
text               object
dtype: object

## Logistic Regression

In [24]:
X = df['text']
y = df['subreddit']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)

In [31]:
cvec = CountVectorizer(stop_words = 'english', 
                       strip_accents = 'ascii', 
                       ngram_range = (1, 2),
                       preprocessor = custom_preprocessor,
                       max_features = 50000)   

X_train_cv = cvec.fit_transform(X_train)

X_test_cv = cvec.transform(X_test)

TypeError: expected string or bytes-like object