In [1]:
# importing necessary library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
%matplotlib inline

In [2]:
# loading the dataset
data = pd.read_csv("Dataset/training_data.csv")

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
data.shape

(31962, 3)

In [5]:
data.size

95886

In [6]:
data.columns

Index(['id', 'label', 'tweet'], dtype='object')

In [7]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [8]:
data['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

# DATA PREPROCESSING

# Cleaning the data

In [9]:
# removing handle names
def remove_handle(tweet):
    match = re.findall("@[\w]*",tweet)
    for i in match:
        tweet = re.sub(i,'',tweet)
    return tweet

In [10]:
vector = np.vectorize(remove_handle)

In [22]:
data['tweets without handle'] = vector(data['tweet'])

In [12]:
data.head()

Unnamed: 0,id,label,tweet,tweets without handle
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation


# Removing punctuation's,numbers and symbols

In [13]:
data['tweets without handle'] = data['tweets without handle'].str.replace("[^a-zA-Z#]"," ")

In [14]:
data.head()

Unnamed: 0,id,label,tweet,tweets without handle
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now #motivation


In [15]:
data.tail()

Unnamed: 0,id,label,tweet,tweets without handle
31957,31958,0,ate @user isz that youuu?ðððððð...,ate isz that youuu ...
31958,31959,0,to see nina turner on the airwaves trying to...,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...",#sikh #temple vandalised in in #calgary #wso...
31961,31962,0,thank you @user for you follow,thank you for you follow


In [16]:
# REMOVING SHORT WORDS
data['tweets without handle']= data['tweets without handle'].str.lower()
data['tweets without handle'] = data['tweets without handle'].apply(lambda x: ' '.join([word for word in str(x).split() if len(word)>3]))

In [17]:
#tokenize the words
tokenized_tweets = data['tweets without handle'].apply(lambda x: x.split())
tokenized_tweets.head()

0    [when, father, dysfunctional, selfish, drags, ...
1    [thanks, #lyft, credit, cause, they, offer, wh...
2                              [bihday, your, majesty]
3                     [#model, love, take, with, time]
4                   [factsguide, society, #motivation]
Name: tweets without handle, dtype: object

In [18]:
from nltk import PorterStemmer
ps = PorterStemmer()
tokenized_tweets = tokenized_tweets.apply(lambda x : [ps.stem(word) for word in x])
tokenized_tweets.head()

0    [when, father, dysfunct, selfish, drag, kid, i...
1    [thank, #lyft, credit, caus, they, offer, whee...
2                              [bihday, your, majesti]
3                     [#model, love, take, with, time]
4                         [factsguid, societi, #motiv]
Name: tweets without handle, dtype: object

In [19]:
for i in range(len(tokenized_tweets)):
    tokenized_tweets[i] = ' '.join(tokenized_tweets[i])
data['tweets without handle'] = tokenized_tweets
data.head()

Unnamed: 0,id,label,tweet,tweets without handle
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunct selfish drag kid into dys...
1,2,0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit caus they offer wheelchair ...
2,3,0,bihday your majesty,bihday your majesti
3,4,0,#model i love u take with u all the time in ...,#model love take with time
4,5,0,factsguide: society now #motivation,factsguid societi #motiv


In [20]:
data.tail()

Unnamed: 0,id,label,tweet,tweets without handle
31957,31958,0,ate @user isz that youuu?ðððððð...,that youuu
31958,31959,0,to see nina turner on the airwaves trying to...,nina turner airwav tri wrap herself mantl genu...
31959,31960,0,listening to sad songs on a monday morning otw...,listen song monday morn work
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,...",#sikh #templ vandalis #calgari #wso condemn
31961,31962,0,thank you @user for you follow,thank follow


In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
vectorizer_bow = CountVectorizer(max_features=6000,stop_words='english',ngram_range=(1,4))

In [25]:
x_bow = vectorizer_bow.fit_transform(data['tweets without handle']).toarray()

In [26]:
x_bow.shape

(31962, 6000)

In [27]:
y_bow = data['label']

In [28]:
y_bow.shape

(31962,)

In [30]:
from sklearn.model_selection import train_test_split
xtrain_bow,xtest_bow,ytrain_bow,ytest_bow = train_test_split(x_bow,y_bow,test_size=0.2,random_state=3)

In [31]:
#checkinf featires names
vectorizer_bow.get_feature_names()

['00',
 '00 shop',
 '00 shop cool',
 '00 shop cool home',
 '000',
 '039',
 '05',
 '06',
 '06 16',
 '06 16 cute',
 '08',
 '10',
 '10 days',
 '10 years',
 '100',
 '100 amazing',
 '100 amazing health',
 '100 amazing health benefits',
 '1000',
 '11',
 '11th',
 '12',
 '13',
 '13th',
 '14',
 '14th',
 '15',
 '16',
 '16 cute',
 '17',
 '18',
 '19',
 '1gabba',
 '1gabba vk',
 '1st',
 '1st time',
 '20',
 '20 speakers',
 '20 speakers free',
 '20 speakers free summit',
 '200',
 '2008',
 '2014',
 '2015',
 '2016',
 '2016 30',
 '2016 30 photos',
 '2016 30 photos buy',
 '2016in4words',
 '2016in4wordsâ',
 '2017',
 '20th',
 '21',
 '21st',
 '22',
 '23',
 '24',
 '24 hours',
 '25',
 '26',
 '26th',
 '27',
 '28',
 '29',
 '2b',
 '2day',
 '2nd',
 '2nd bihday',
 '2pm',
 '30',
 '30 photos',
 '30 photos buy',
 '30 photos buy things',
 '31',
 '32',
 '35',
 '36',
 '3d',
 '3d really',
 '3d really think',
 '3d really think head',
 '3rd',
 '40',
 '40404',
 '42',
 '45',
 '46',
 '48',
 '49',
 '4pm',
 '4th',
 '50',
 '50 de

In [32]:
vectorizer_bow.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 6000,
 'min_df': 1,
 'ngram_range': (1, 4),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [33]:
count_df = pd.DataFrame(x_bow,columns = vectorizer_bow.get_feature_names())

In [34]:
count_df.head()

Unnamed: 0,00,00 shop,00 shop cool,00 shop cool home,000,039,05,06,06 16,06 16 cute,...,¾ð,¾ð ¾ð,à¹,à¹ à¹,ï¼,ï¼ ï¼,ó¾,ó¾ ó¾,ó¾ ó¾ ó¾,ó¾ ó¾ ó¾ ó¾
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_tdif = TfidfVectorizer(max_features=6000,stop_words='english',ngram_range=(1,4))

In [41]:
x_tdif = vectorizer_tdif.fit_transform(data['tweets without handle'])

In [42]:
x_tdif = x_tdif.todense()
x_tdif

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [44]:
y_tdif=data['label']

In [45]:
y_tdif

0        0
1        0
2        0
3        0
4        0
        ..
31957    0
31958    0
31959    0
31960    1
31961    0
Name: label, Length: 31962, dtype: int64

In [46]:
from sklearn.model_selection import train_test_split
xtrain_tdif,xtest_tdif,ytrain_tdif,ytest_tdif = train_test_split(x_tdif,y_tdif,test_size=0.2,random_state=3)

In [47]:
xtrain_tdif

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [48]:
xtrain_tdif.shape

(25569, 6000)

In [49]:
ytrain_tdif.shape

(25569,)

# DATA VISUALIZATION

In [50]:
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image
import urllib
import requests

In [None]:
real =' '.join(text for text in data1['clean text'][data1['label']==0])