In [1]:
#https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/
#https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial
import pandas as pd
import sys

# Show up to 500 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 500)
import numpy as np
# Print arrays in full, without "..." summarization. NumPy rejects a NaN
# threshold (ValueError), so use sys.maxsize, the value the NumPy
# documentation recommends for disabling summarization.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
import os

In [3]:
# Load the training tweets from the dataset directory next to the notebook.
train = pd.read_csv(
    os.path.join("dataset", "train_E6oV3lV.csv")
)
#train['tweet'].head()

In [4]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# `stop` is a list; testing membership against it for every word of every
# tweet is a linear scan each time. A set gives the same result with
# constant-time lookups.
stop_lookup = set(stop)
# Keep only the stop words of each tweet, in their original order, joined by
# single spaces.
train['stopwords'] = train['tweet'].apply(
    lambda x: ' '.join(word for word in x.split() if word in stop_lookup)
)
train[['tweet', 'stopwords']].head()

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,when a is and is so he his into his
1,@user @user thanks for #lyft credit i can't us...,for i they don't in
2,bihday your majesty,your
3,#model i love u take with u all the time in ...,i with all the in
4,factsguide: society now #motivation,now


In [5]:
# Strip punctuation: delete every character that is neither a word character
# (\w) nor whitespace (\s).
# - regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
#   pattern as a literal string by default, which would leave the text unchanged.
# - The pattern is a raw string so that \w and \s are not read as (invalid)
#   Python string escapes.
train['tweet'] = train['tweet'].str.replace(r'[^\w\s]', '', regex=True)
train['tweet'].head()

0     user when a father is dysfunctional and is so...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model   i love u take with u all the time in u...
4                 factsguide society now    motivation
Name: tweet, dtype: object

In [6]:
#remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
# `stop` is a list; convert it to a set once so the per-word membership test
# is a constant-time lookup rather than a linear scan.
stop_lookup = set(stop)
# Rebuild each tweet from the words that are NOT stop words, joined by single
# spaces.
train['tweet'] = train['tweet'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

In [None]:
#find 50 most common words
# Concatenate all tweets, split on whitespace, count each word and keep the
# 50 most frequent (value_counts sorts in descending order of frequency).
haha = (
    pd.Series(' '.join(train['tweet']).split())
    .value_counts()
    .head(50)
)
haha.index

In [None]:
#most common 50 words
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import matplotlib
import seaborn as sns

In [None]:
# Register the font settings before the figure is created so that they are in
# effect when this plot's text elements are built.
font = {'weight' : 'bold',
        'size'   : 10}
matplotlib.rc('font', **font)

# Bar chart of the most common words (x) against their counts (y).
plt.figure(figsize=(15,6))
sns.barplot(x=haha.index,
            y=haha.values)
plt.xticks(rotation=90)

# tight_layout must be *called*; the bare attribute reference did nothing
# except echo the function object as the cell output.
plt.tight_layout()

In [None]:
#wordCloud
from wordcloud import WordCloud, STOPWORDS

plt.figure(figsize=(16,13))
wc = WordCloud(background_color="white", max_words=10000,
                stopwords=STOPWORDS, max_font_size= 40)
# Join the tweets with a single space. The string passed to str.join is the
# *separator*: using "tweet wordcloud" there glued the last word of each tweet
# to "tweet" and the first word of the next tweet to "wordcloud", corrupting
# the tokens and injecting spurious words into the cloud.
wc.generate(" ".join(train['tweet']))
plt.title("", fontsize=20)

# Recolour with a fixed random_state so the rendering is reproducible.
plt.imshow(wc.recolor( colormap= 'Pastel2' , random_state=17))
plt.axis('off')

In [None]:
#tokenize 
from nltk import word_tokenize
# Tokenize each tweet with NLTK and store the token list next to the text.
train['tokenized_text'] = train["tweet"].apply(word_tokenize)
train.loc[:, ['tweet', 'tokenized_text']].head()

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Lemmatize every whitespace-separated word of `text` and re-join with single spaces."""
    return " ".join(map(lemmatizer.lemmatize, text.split()))


# Lemmatized copy of each tweet, stored in its own column.
train['tweet_lemm'] = train['tweet'].apply(_lemmatize_text)
train['tweet_lemm'].head()

In [None]:
from nltk import bigrams
# Small demonstration of NLTK bigrams on a fixed sentence: each pair of
# adjacent whitespace-separated words is printed on its own line.
# Open question from the author: what do we do with bigrams or n-grams?
string = "I really like python, it's pretty awesome."
string_bigrams = bigrams(string.split())
print(*string_bigrams, sep="\n")

In [7]:
#bag of words
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Bag-of-words vectorizer: lower-cases the text, drops scikit-learn's built-in
# English stop words and keeps at most 1000 features.
cv = CountVectorizer(
    stop_words='english',
    lowercase=True,
    max_features=1000,
)
# Learn the vocabulary from the tweets and build the document-term matrix.
train_bow = cv.fit_transform(train['tweet'])

In [9]:
#Sentiment Analysis
from textblob import TextBlob
# Polarity score of each tweet: the first field of TextBlob's sentiment tuple.
train['sentiment'] = train['tweet'].apply(
    lambda text: TextBlob(text).sentiment.polarity
)
train.loc[:, ['tweet', 'sentiment']].head()

Unnamed: 0,tweet,sentiment
0,user father dysfunctional selfish drags kids d...,-0.5
1,user user thanks lyft credit cant use cause do...,0.2
2,bihday majesty,0.0
3,model love u take u time urð ðððð ððð,0.5
4,factsguide society motivation,0.0


In [10]:
#Topic modeling
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 10-topic LDA model on the bag-of-words matrix; random_state is fixed
# so that repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=10,
    learning_method="batch",
    max_iter=25,
    random_state=0,
)
# Per-document topic weights produced while fitting.
document_topics = lda.fit_transform(train_bow)
# One row per topic, one column per vocabulary feature.
lda.components_.shape

(10, 1000)

In [11]:
# For every topic (row), order the feature indices by descending weight:
# argsort is ascending, so each row is reversed.
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); fall back to the old name on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = np.array(cv.get_feature_names_out())
else:
    feature_names = np.array(cv.get_feature_names())
import mglearn

In [12]:
# Print the 10 highest-weighted words of each of the 10 topics, five topics
# per output chunk.
mglearn.tools.print_topics(
    topics=range(10),
    feature_names=feature_names,
    sorting=sorting,
    topics_per_chunk=5,
    n_words=10,
)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
user          day           people        love          happy         
amp           fathers       need          time          smile         
positive      good          friday        ððð           weekend       
make          great         makes         life          love          
affirmation   today         bear          want          summer        
i_am          morning       world         ðððð          friends       
thanks        tomorrow      polar         model         fun           
2016          fathersday    climb         bull          family        
ready         love          change        live          sunday        
getting       happy         city          urð           cute          


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
new 