## Imports

In [1]:
#Base
import pandas as pd
import numpy as np

#others

import itertools
import string
import re

#Visuals
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

#warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE

In [3]:
#nltk
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer, sent_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk import WordNetLemmatizer, pos_tag

In [4]:
#gensim
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

In [5]:
# Load the pre-normalized tweets CSV into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a project-relative data directory.
tweets_csv_path = '/Users/kellyjara/Desktop/Project_4/Data/Normalized_tweets.csv'
df = pd.read_csv(tweets_csv_path)
df

Unnamed: 0.1,Unnamed: 0,tweet_text,emotion,target,norm_tweet
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,1,iphone hr tweet dead need upgrade plugin station
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,0,know awesome ipadiphone app youll likely appre...
2,2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,0,wait also sale
3,3,@sxsw I hope this year's festival isn't as cra...,Negative emotion,1,hope year festival isnt crashy year iphone app
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,0,great stuff fri mayer google tim oreilly tech ...
...,...,...,...,...,...
3543,9077,@mention your PR guy just convinced me to swit...,Positive emotion,0,pr guy convince switch back iphone great coverage
3544,9079,&quot;papyrus...sort of like the ipad&quot; - ...,Positive emotion,0,quotpapyrussort ipadquot nice lol lavelle
3545,9080,Diller says Google TV &quot;might be run over ...,Negative emotion,1,diller say google tv quotmight run playstation...
3546,9085,I've always used Camera+ for my iPhone b/c it ...,Positive emotion,0,ive always use camera iphone bc image stabiliz...


In [6]:
# Drop the columns that are not needed for vectorizing / topic modeling.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=['Unnamed: 0', 'tweet_text', 'emotion'], errors='ignore')

In [7]:
# Inspect the frame after the unused columns have been dropped.
df

Unnamed: 0,target,norm_tweet
0,1,iphone hr tweet dead need upgrade plugin station
1,0,know awesome ipadiphone app youll likely appre...
2,0,wait also sale
3,1,hope year festival isnt crashy year iphone app
4,0,great stuff fri mayer google tim oreilly tech ...
...,...,...
3543,0,pr guy convince switch back iphone great coverage
3544,0,quotpapyrussort ipadquot nice lol lavelle
3545,1,diller say google tv quotmight run playstation...
3546,0,ive always use camera iphone bc image stabiliz...


## Vectorizing

In [8]:
#Train-Test-Split
# Features are the normalized tweet strings; labels are the numeric target.
X = df['norm_tweet']
y = df['target']

# Hold out 20% of the tweets for testing. A fixed random_state makes the split
# (and therefore every vocabulary size and model output below) reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

In [9]:
# Peek at the training split produced by train_test_split.
X_train

1184    panel quotstaying alive indie iphone game deve...
2820      help clean organize newscontent great job owner
532     give away ipad creator popular disc interactiv...
304     first ipad didnt even exist last year already ...
3433          get crave mind create app foodspotting link
                              ...                        
3052    true rt amp quotgoogle tweetquot new quotthink...
1255    mayer come sans intro still get cheer launch g...
774     doesnt make sense limit content specific platf...
876     geek interactive panel ipad holler gram app us...
30      false alarm google circle come nowand probably...
Name: norm_tweet, Length: 2838, dtype: object

### CountVectorizer

In [10]:
# Bag-of-words counts. The vocabulary is learned from the training tweets only.
cv = CountVectorizer()
cv_X_train = cv.fit_transform(X_train)

# Use transform (not fit_transform) on the test split: refitting would replace
# the vocabulary with the test set's words, so the test matrix columns would no
# longer line up with the train matrix and the names listed below would be the
# test vocabulary instead of the training one.
cv_X_test = cv.transform(X_test)

# Vocabulary learned from X_train (one entry per column of both matrices).
cv.get_feature_names_out()

array(['abroad', 'absolute', 'abt', ..., 'zagg', 'zip', 'zoom'],
      dtype=object)

#### Topic Modeling

In [11]:
# Factorize the train count matrix into 5 topics with NMF.
# random_state pins the initialisation / solver so the topics are the same on
# every run.
cv_top_mod = NMF(n_components = 5, random_state = 0)
cv_top_mod.fit(cv_X_train)

In [12]:
# Pull the two NMF factors out of the fitted model.
# H here is the per-document topic weights (n_docs x n_topics) and W is the
# per-topic term weights (n_topics x n_features), as the printed shapes show.
# NOTE(review): scikit-learn's documentation uses the opposite letters
# (W = transform output, H = components_); the names are kept as-is here.
H = cv_top_mod.transform(cv_X_train)
W = cv_top_mod.components_

# Report the shape of each factor and of the matrix that was factorized.
for label, matrix in (("W", W), ("H", H), ("train", cv_X_train)):
    print(f"Shape of {label} is{matrix.shape}")

Shape of W is(5, 4232)
Shape of H is(2838, 5)
Shape of train is(2838, 4232)


In [13]:
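# 2,838 tweets in this train set
# 4,232 features (the vocabulary size, per the shape printed above; it depends on which tweets land in the train split)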

In [14]:
# Display the training split again (same Series as shown after the split).
X_train

1184    panel quotstaying alive indie iphone game deve...
2820      help clean organize newscontent great job owner
532     give away ipad creator popular disc interactiv...
304     first ipad didnt even exist last year already ...
3433          get crave mind create app foodspotting link
                              ...                        
3052    true rt amp quotgoogle tweetquot new quotthink...
1255    mayer come sans intro still get cheer launch g...
774     doesnt make sense limit content specific platf...
876     geek interactive panel ipad holler gram app us...
30      false alarm google circle come nowand probably...
Name: norm_tweet, Length: 2838, dtype: object

In [21]:
# Prepare gensim inputs from every normalized tweet in df (not just the train split).
texts = df['norm_tweet'].tolist()

# One list of word tokens per tweet.
token_text = list(map(word_tokenize, texts))

# Map each unique token to an integer id.
id2word = corpora.Dictionary(token_text)

# Bag-of-words representation: a list of (token_id, count) pairs per tweet.
corpus = list(map(id2word.doc2bow, token_text))

# Sanity check: first document's bag-of-words and a summary of the dictionary.
print(corpus[0:1])
print(id2word)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]]
Dictionary<4702 unique tokens: ['dead', 'hr', 'iphone', 'need', 'plugin']...>


In [28]:
# Short alias for gensim's LDA model class (already imported at the top of the notebook).
Lda = LdaModel

In [31]:
# Train a 2-topic LDA model on the bag-of-words corpus.
# A single pass over the corpus, no periodic perplexity evaluation, and a fixed
# seed so the topics are reproducible.
ldamodel = Lda(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=1,
    eval_every=None,
    random_state=0,
)

In [33]:
# Show the top-weighted words per topic (arguments spelled out at gensim's defaults).
ldamodel.print_topics(num_topics=20, num_words=10)

[(0,
  '0.043*"rt" + 0.036*"link" + 0.029*"ipad" + 0.027*"google" + 0.023*"iphone" + 0.016*"app" + 0.016*"new" + 0.011*"get" + 0.010*"apple" + 0.008*"store"'),
 (1,
  '0.035*"apple" + 0.034*"link" + 0.033*"rt" + 0.025*"ipad" + 0.024*"store" + 0.014*"austin" + 0.013*"google" + 0.012*"popup" + 0.010*"open" + 0.008*"come"')]

#### Logistic Regression

### TfidfVectorizer

In [None]:
# TF-IDF features: learn the vocabulary and IDF weights from the training
# tweets only, then project both splits into that same feature space.
vec = TfidfVectorizer()
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Vocabulary learned from X_train.
vec.get_feature_names_out()