In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import LabelEncoder


In [3]:
# Load the tweet dataset. Column names are supplied explicitly via `names`,
# so pandas treats the first line of the file as data rather than a header.
df = pd.read_csv(
    "training.csv",
    names=["target", "id", "date", "flag", "user", "text"],
    encoding="ISO-8859-1",
    low_memory=False,
)

In [4]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Exploratory data analysis

In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1600000, 6)

In [6]:
# Per-column dtypes and non-null counts, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [7]:
# Use the tweet stored under index label 0 as a worked example.
sample = df["text"][0]
sample

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

Tokenize the sample tweet and return its first 10 tokens.

In [8]:
import nltk
from nltk import word_tokenize
# Split the sample tweet into tokens with NLTK's word_tokenize and show the
# first ten of them.
# NOTE(review): the only NLTK resource this notebook downloads is 'stopwords';
# word_tokenize presumably also needs NLTK's tokenizer data to be installed --
# confirm on a fresh environment.
sample_tokens=word_tokenize(sample)
sample_tokens[:10]


['@',
 'switchfoot',
 'http',
 ':',
 '//twitpic.com/2y1zl',
 '-',
 'Awww',
 ',',
 'that',
 "'s"]

In [9]:
# Build a frequency distribution over the sample's tokens and list the ten
# most common tokens together with their counts.
from nltk import FreqDist
sample_freqdist=FreqDist(sample_tokens)
sample_freqdist.most_common(10)

[('.', 2),
 ('@', 1),
 ('switchfoot', 1),
 ('http', 1),
 (':', 1),
 ('//twitpic.com/2y1zl', 1),
 ('-', 1),
 ('Awww', 1),
 (',', 1),
 ('that', 1)]

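A Document-Term Matrix (DTM) is a matrix that records how often each term occurs in a collection of documents: each row is a document, each column is a term, and each cell holds that term's count in that document.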

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
def create_dtm(series):
    """Build a dense document-term matrix for a collection of documents.

    Parameters
    ----------
    series : iterable of str
        The documents to vectorize (for example a pandas Series of tweets).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term learned by
        ``CountVectorizer``; each cell holds the term's count in that
        document. Rows are numbered from 0 in input order; the index of
        ``series`` is not carried over.

    Notes
    -----
    The matrix is materialised densely, so memory grows with
    ``n_documents * vocabulary_size``. Only call this on small samples.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(series)
    # ``toarray()`` yields a plain ndarray; ``todense()`` would return the
    # legacy ``numpy.matrix`` type, which NumPy discourages for new code.
    return pd.DataFrame(
        counts.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

# Demonstrate on the first few tweets only, since create_dtm builds a dense table.
create_dtm(df.text.head())

Unnamed: 0,2y1zl,50,all,also,am,and,as,at,awww,ball,...,third,times,to,today,twitpic,update,upset,whole,why,you
0,1,0,0,0,0,0,0,0,1,0,...,1,0,1,0,1,0,0,0,0,1
1,0,0,0,1,0,1,1,0,0,0,...,0,0,0,1,0,1,1,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,1,1,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1


In [19]:
#lets check on feature importance
from sklearn.linear_model import LogisticRegression

def positive_tweets(text, sentiment, n):
    """Return the ``n`` tokens with the largest logistic-regression coefficients.

    A bag-of-words matrix is built from ``text`` with ``CountVectorizer`` and
    a ``LogisticRegression`` model is fitted to predict ``sentiment`` from it.
    Coefficients are read from the first row of ``coef_``.

    Parameters
    ----------
    text : iterable of str
        The documents to vectorize.
    sentiment : array-like
        Target label for each document, aligned with ``text``.
    n : int
        Number of top-weighted tokens to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Tokens'`` and ``'coefficients'``, holding the ``n`` rows
        with the largest coefficients in descending order.

    Notes
    -----
    NOTE(review): for a two-class target, scikit-learn's docs describe the
    single ``coef_`` row as belonging to the second entry of ``classes_``;
    with more than two classes row 0 would describe the first class instead.
    Confirm the target really is binary.
    """
    vectorizer = CountVectorizer()
    classifier = LogisticRegression(solver='lbfgs', max_iter=2500, random_state=1234)
    # Vectorize the corpus and fit the classifier on the resulting counts.
    classifier.fit(vectorizer.fit_transform(text), sentiment)
    token_weights = pd.DataFrame({
        'Tokens': vectorizer.get_feature_names_out(),
        'coefficients': classifier.coef_[0],
    })
    return token_weights.nlargest(n, 'coefficients')

# Fit on every tweet in df and show the five tokens with the largest coefficients.
# NOTE(review): this trains on the whole frame (1,600,000 rows per df.shape)
# and no output is recorded for this cell -- confirm it runs to completion.
positive_tweets(df.text,df.target,5)

In [10]:
def complexity(string):
    """Return the lexical diversity (unique tokens / total tokens) of ``string``.

    Parameters
    ----------
    string : str
        Text to measure; it is split into tokens with ``word_tokenize``.

    Returns
    -------
    float or int
        Number of distinct tokens divided by the total number of tokens.
        Tokens are compared case-sensitively. Returns ``0`` when the text
        produces no tokens, which avoids a division by zero.
    """
    # Tokenize once and reuse the result for both counts; tokenization is the
    # only non-trivial work in this function.
    tokens = word_tokenize(string)
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

# Complexity score of each of the first ten tweets.
df.text.head(10).apply(complexity)

0    0.962963
1    1.000000
2    1.000000
3    1.000000
4    0.866667
5    1.000000
6    1.000000
7    0.857143
8    1.000000
9    1.000000
Name: text, dtype: float64

In [11]:
# Make sure NLTK's stopword corpus is available locally before it is read
# through nltk.corpus.stopwords.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anitah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:

from nltk.corpus import stopwords
# Lazily-built frozenset of English stopwords, shared by every call.
_ENGLISH_STOPWORDS = None


def _english_stopword_set():
    """Return the English stopwords as a frozenset, loading the corpus once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stopword_remover(string):
    """Tokenize ``string`` and drop English stopwords.

    Parameters
    ----------
    string : str
        Text to tokenize with ``word_tokenize``.

    Returns
    -------
    list of str
        The tokens, in original order and original casing, whose lower-cased
        form is not an English stopword. Punctuation tokens are kept.
    """
    # Fetching the stopword list on every call and testing membership against
    # a list is needlessly slow when this is applied to a large Series, so the
    # words are loaded once and looked up in a set.
    english_stopwords = _english_stopword_set()
    return [
        token
        for token in word_tokenize(string)
        if token.lower() not in english_stopwords
    ]
# Stopword-free token lists for the first five tweets.
df.text.head(5).apply(stopword_remover)

0    [@, switchfoot, http, :, //twitpic.com/2y1zl, ...
1    [upset, ca, n't, update, Facebook, texting, .....
2    [@, Kenichan, dived, many, times, ball, ., Man...
3              [whole, body, feels, itchy, like, fire]
4    [@, nationwideclass, ,, 's, behaving, ., 'm, m...
Name: text, dtype: object

In [13]:
def stopword_nonalpha_remover(string):
    """Tokenize ``string`` and keep only alphabetic, non-stopword tokens.

    The text is first passed through ``stopword_remover``; any remaining token
    that is not made up solely of letters (``str.isalpha``) is then discarded.
    Returns the surviving tokens as a list, in their original order.
    """
    alphabetic_tokens = []
    for token in stopword_remover(string):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens
# Alphabetic, stopword-free token lists for the first few tweets.
df.text.head().apply(stopword_nonalpha_remover)


0    [switchfoot, http, Awww, bummer, shoulda, got,...
1    [upset, ca, update, Facebook, texting, might, ...
2    [Kenichan, dived, many, times, ball, Managed, ...
3              [whole, body, feels, itchy, like, fire]
4            [nationwideclass, behaving, mad, ca, see]
Name: text, dtype: object

In [14]:
#text cleaning
def complexity_cleaned(series):
    """Return the lexical diversity of each text after cleaning.

    Every element of ``series`` is reduced to its alphabetic, non-stopword
    tokens with ``stopword_nonalpha_remover``; the score is the number of
    distinct remaining tokens divided by the number of remaining tokens.

    Parameters
    ----------
    series : pandas.Series of str
        Texts to score.

    Returns
    -------
    pandas.Series
        One score per input element, on the same index. Elements with no
        remaining tokens score ``0``.
    """
    def _cleaned_ratio(text):
        # Score the cleaned tokens directly instead of joining them back into
        # a string and tokenizing that string again: each text is tokenized
        # once rather than repeatedly.
        tokens = stopword_nonalpha_remover(text)
        if not tokens:
            return 0
        return len(set(tokens)) / len(tokens)

    return series.apply(_cleaned_ratio)
# Score every tweet, store the result as a new 'complexity' column on df, and
# show the five highest-scoring rows.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt. It
# processes all 1,600,000 rows of df.text one by one, so expect a long
# runtime; consider trying it on a sample first.
df['complexity']=complexity_cleaned(df.text)
df.sort_values('complexity',ascending=False).head(5)

KeyboardInterrupt: 