# Twitter Sentiment Analysis
Based on the Kaggle notebook [Twitter Sentiment Analysis by paoloripamonti](https://www.kaggle.com/code/paoloripamonti/twitter-sentiment-analysis/notebook).

In [4]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# # Keras
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.models import Sequential
# from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
# from keras import utils
# from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Logging setup: every record is rendered as "<time> : <level> : <message>",
# and the root logger reports INFO and above.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [5]:
# Download the NLTK 'stopwords' package (the corpus behind `nltk.corpus.stopwords`,
# imported above). The call returns True on success, as shown in the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nickl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
from pathlib import Path
import seaborn as sns
# DATASET: location, text encoding and column names used to parse the raw CSV archive.
DATASET_ENCODING = "ISO-8859-1"
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_PATH = r'../data/training.1600000.processed.noemoticon.csv.zip'

# `names=` supplies the column labels explicitly; pandas reads the zipped CSV directly.
df = pd.read_csv(
    DATASET_PATH,
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

In [12]:
# Preview five random rows. A fixed random_state keeps the displayed sample
# identical across "Restart & Run All" executions.
df.sample(n=5, random_state=42)

Unnamed: 0,target,ids,date,flag,user,text
1174710,4,1980987772,Sun May 31 08:09:16 PDT 2009,NO_QUERY,shiningcher,Morning tweethearts
678316,0,2248939797,Fri Jun 19 21:26:38 PDT 2009,NO_QUERY,thsnicegrl,@eskimo1105 fml my dreams might be crushed th...
425356,0,2063308852,Sun Jun 07 02:08:38 PDT 2009,NO_QUERY,kharissa101,says crazykart lng. (tears) http://plurk.com/...
1541652,4,2180849449,Mon Jun 15 11:01:42 PDT 2009,NO_QUERY,tweeteradder2,@alainazieman Get 100 followers a day using ww...
1385747,4,2052773271,Sat Jun 06 01:58:13 PDT 2009,NO_QUERY,Faespace,"@GarethCliff yeah, it's good isn't it Poor fa..."


In [17]:
# Numeric sentiment code -> human-readable label.
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    """Translate a sentiment code into its string label.

    Parameters
    ----------
    label : int or str
        A numeric code (0, 2 or 4, or anything ``int()`` converts to one of
        them), or a label that has already been decoded.

    Returns
    -------
    str
        "NEGATIVE", "NEUTRAL" or "POSITIVE".

    Raises
    ------
    KeyError
        If the numeric code is not a key of ``decode_map``.
    ValueError
        If ``label`` is neither an already-decoded label nor convertible to int.
    """
    # Already-decoded labels pass through unchanged, so re-running a decoding
    # cell on a column that was decoded in place does not fail on int("NEGATIVE").
    if isinstance(label, str) and label in decode_map.values():
        return label
    return decode_map[int(label)]
# Swap the numeric sentiment codes in `target` for their string labels (in place on df).
target_labels = df['target'].map(decode_sentiment)
df['target'] = target_labels

In [15]:
%%timeit
# v_1: decode the labels with Series.map on a copy of df (uses .map, not .apply).
# NOTE(review): decode_sentiment calls int(label), so this timing is only meaningful
# while df['target'] still holds the raw numeric codes. The execution counts show this
# cell ran (In [15]) before the cell that decodes df['target'] in place (In [17]).
df_1=df.copy()
df_1['target'] = df_1['target'].map(decode_sentiment)

348 ms ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%time
# v_2 apply
df_2=df.copy()
df_2.target = df_2.target.apply(lambda x: decode_sentiment(x))

CPU times: total: 438 ms
Wall time: 447 ms


In [18]:
# Display the `target` column. Transposing a Series returns the Series itself,
# so no `.T` is needed here.
df['target']


Unnamed: 0,target,ids,date,flag,user,text
0,NEGATIVE,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,NEGATIVE,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,NEGATIVE,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,NEGATIVE,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,NEGATIVE,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,POSITIVE,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,POSITIVE,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,POSITIVE,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,POSITIVE,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
