## FEATURE ENGINEERING

In [35]:
# import libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


# text processing libraries
import re
import contractions

from collections import Counter
# import string
import nltk
# import warnings
# %matplotlib inline
# warnings.filterwarnings("ignore")
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud

from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\macie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\macie\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [36]:
# import data   
tweets = pd.read_csv("..//data//tweets.csv", encoding="latin-1")

In [37]:
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    tweets.drop(columns=['Target']), # X
    tweets['Target'], # y
    test_size=0.3, random_state=42)
x_train_valid.shape, y_train_valid.shape, x_test.shape, y_test.shape 

((734002, 5), (734002,), (314573, 5), (314573,))

In [38]:
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_valid, # X
    y_train_valid, # y
    test_size=0.3, random_state=42)
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

((513801, 5), (513801,), (220201, 5), (220201,))

In [39]:
x_train = x_train.drop(['ID', 'Date', 'flag', 'User'], axis = 'columns')
# removing unnecessary user tags
x_train['Text'] = x_train['Text'].replace(r"@\w+", "", regex=True)
# resolving contractions (and slang)
x_train['Text'] = x_train['Text'].apply(lambda x: contractions.fix(x))
# removing punctuation marks
x_train['Text'] = x_train['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
# deleting websites
x_train['Text'] = x_train['Text'].apply(lambda x: re.sub(r'http\S+', '', x))
# lowercasing letters in the text
x_train['Text'] = x_train['Text'].str.lower()
#x_train['Text'] = x_train['Text'].apply(lambda x: " ".join([w for w in x.split() if len(w) >= 2]))
# individual words considered as tokens
tokenized_tweet = x_train['Text'].apply(lambda x: x.split())
# Initialize wordnet lemmatizer only on verbs - makes the biggest sense
wnl = WordNetLemmatizer()
tokenized_tweet = tokenized_tweet.apply(lambda s: [wnl.lemmatize(word, pos='v') for word in s])
tokenized_tweet = tokenized_tweet.apply(lambda s: [wnl.lemmatize(word, pos='n') for word in s])
tokenized_tweet = tokenized_tweet.apply(lambda s: [wnl.lemmatize(word, pos='a') for word in s])
tokenized_tweet = tokenized_tweet.apply(lambda s: [wnl.lemmatize(word, pos='r') for word in s])
# combining to sentences
combined_sentences = [' '.join(tokens) for tokens in tokenized_tweet]
x_train['combined_tweet'] = combined_sentences
x_train


Unnamed: 0,Text,combined_tweet
810677,working listening to kmps happy my boss amp th...,work listen to kmps happy my bos amp the mecha...
684982,we both know she is late for quotat 11quot,we both know she be late for quotat 11quot
954730,do not bogart that joint my friend,do not bogart that joint my friend
142760,i am so sick of being sick i do not want to mi...,i be so sick of be sick i do not want to miss ...
200331,i am pretty sure i went to bed about 2 hours a...,i be pretty sure i go to bed about 2 hour ago ...
...,...,...
1023596,good morning sunshine,good morning sunshine
281932,is the website down i cannot get it to load,be the website down i cannot get it to load
828817,i listened to the itunes samplesmy fav albums...,i listen to the itunes samplesmy fav album be ...
683555,i use palringo for chat and fring for voice c...,i use palringo for chat and fring for voice ca...


In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [40]:
bow_vectorizer = CountVectorizer(max_df = 0.95, min_df = 10, max_features = 1000, stop_words='english')
bow = bow_vectorizer.fit_transform(x_train['combined_tweet'])
x_train = bow

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

In [41]:
#training
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

LogisticRegression(max_iter=1000)

In [42]:
x_valid = x_valid.drop(['ID', 'Date', 'flag', 'User'], axis = 'columns')
# removing unnecessary user tags
x_valid['Text'] = x_valid['Text'].replace(r"@\w+", "", regex=True)
# resolving contractions (and slang)
x_valid['Text'] = x_valid['Text'].apply(lambda x: contractions.fix(x))
# removing punctuation marks
x_valid['Text'] = x_valid['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
# deleting websites
x_valid['Text'] = x_valid['Text'].apply(lambda x: re.sub(r'http\S+', '', x))
# lowercasing letters in the text
x_valid['Text'] = x_valid['Text'].str.lower()
#x_valid['Text'] = x_valid['Text'].apply(lambda x: " ".join([w for w in x.split() if len(w) >= 2]))
# individual words considered as tokens
tokenized_tweet = x_valid['Text'].apply(lambda x: x.split())
# Initialize wordnet lemmatizer only on verbs - makes the biggest sense
wnl = WordNetLemmatizer()
tokenized_tweet = tokenized_tweet.apply(lambda s: [wnl.lemmatize(word, pos='v') for word in s])
tokenized_tweet = tokenized_tweet.apply(lambda s: [wnl.lemmatize(word, pos='n') for word in s])
tokenized_tweet = tokenized_tweet.apply(lambda s: [wnl.lemmatize(word, pos='a') for word in s])
tokenized_tweet = tokenized_tweet.apply(lambda s: [wnl.lemmatize(word, pos='r') for word in s])
# combining to sentences
combined_sentences = [' '.join(tokens) for tokens in tokenized_tweet]
x_valid['combined_tweet'] = combined_sentences
x_valid

Unnamed: 0,Text,combined_tweet
240689,tierd and it is school tomorrow last week atl...,tierd and it be school tomorrow last week atleast
413003,twitter gets boring n boring everydayno star w...,twitter get bore n bore everydayno star want 2...
950284,i am watching guy ripley right nowhahahilarious,i be watch guy ripley right nowhahahilarious
672298,that is the way indoor stadium toilets are,that be the way indoor stadium toilet be
852721,it must be all that bike riding,it must be all that bike rid
...,...,...
55759,wantd 2b comedian when lil boy i memrize comm...,wantd 2b comedian when lil boy i memrize comme...
175608,omg i cannot believe jay leno is going off the...,omg i cannot believe jay leno be go off the air
661283,i do not know my days are all messed up since...,i do not know my day be all mess up since i ge...
43369,so i am guessin meant midnight pacific time,so i be guessin mean midnight pacific time


In [43]:
bow_vectorizer = CountVectorizer(max_df=0.95, min_df=10, max_features=1000, stop_words='english')
bow = bow_vectorizer.fit_transform(x_valid['combined_tweet'])
x_valid = bow

In [44]:
#testing
pred=  model.predict(x_valid)

In [45]:
from sklearn.metrics import f1_score
f1_score(y_valid, pred, pos_label=4)

0.10305283879636283

In [46]:
accuracy_score(y_valid,pred)

0.7392836544793167