In [1]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import FastText
from gensim.models.fasttext import load_facebook_model
from ydata_profiling import ProfileReport
import plotly.express as px
import numpy as np
import re
import random
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
nltk.download('punkt') 
nltk.download('stopwords')   
nltk.download('wordnet')  
nltk.download('punkt', download_dir='/kaggle/working/nltk_data/')
nltk.download('stopwords', download_dir='/kaggle/working/nltk_data/')
nltk.download('wordnet', download_dir='/kaggle/working/nltk_data/')
nltk.data.path.append('/kaggle/working/nltk_data/')
import warnings
warnings.filterwarnings('ignore')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /kaggle/working/nltk_data/...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /kaggle/working/nltk_data/...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data/...


In [2]:
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset_path="/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv"
df = pd.read_csv(dataset_path, encoding =DATASET_ENCODING , names=DATASET_COLUMNS)
df.drop(columns=["ids","date","flag"],inplace=True)

In [3]:
first_half = df.iloc[:80000]
second_half = df.iloc[80000:]

# Randomly sample 100 rows from each half
sample_first_half = first_half.sample(100, random_state=1)  # random_state for reproducibility
sample_second_half = second_half.sample(100, random_state=1)

# Concatenate the two samples into a new DataFrame
df = pd.concat([sample_first_half, sample_second_half])
df.head(10)

Unnamed: 0,target,user,text
7126,0,andrey_romanoff,@newslava welcome to the club
46624,0,rosiej,Thai place got my order wrong I want my noodl...
67020,0,devilmaysigh,tummy ache..
67356,0,hannahshu,"Hates when people, aka my sisters, eat my ice ..."
61688,0,carol_13,To doente
45869,0,smartestgames,Someone in UNITED STATES didn't like Globs htt...
42565,0,lunarlie,@tinyvamp i prefer degrassi...but since noggin...
45818,0,rich_wallace,couldn't break 6m 33s on survival - the light ...
21617,0,chrisong,"Argh, the speakers on my Macbook are screwing..."
41746,0,ScorpioRisingTX,Bad newz Wonderbred is very sick! We had to c...


In [4]:
# Plot distribution of sentiment classes
fig = px.bar(df['target'].value_counts(), title='Sentiment Distribution',
             labels={'value': 'Number of Tweets', 'index': 'Sentiment'}, 
             text_auto=True,
             template='plotly_dark')

fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)'
)

fig.show()

In [5]:
# Prepare data for pie chart
sentiment_counts = df['target'].value_counts().reset_index()
sentiment_counts.columns = ['sentiment', 'count']
sentiment_counts['sentiment'] = sentiment_counts['sentiment'].map({1: 'Positive', 0: 'Negative'})

# Plot pie chart
fig = px.pie(sentiment_counts, values='count', names='sentiment', title='Sentiment Distribution',
             color='sentiment', color_discrete_map={'Positive':'#00CC96', 'Negative':'#EF553B'},
             template='plotly_dark')

fig.update_traces(textposition='inside', textinfo='percent+label')

fig.show()

In [6]:
df["target"]=df["target"].map({4:1,0:0})
df["target"].value_counts()

target
0    158
1     42
Name: count, dtype: int64

In [10]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https?://\S+|#\S+|@\S+|[0-9]+|\W", " ", text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

df['cleaned_text'] = df['text'].apply(preprocess_text)
df.head()

Unnamed: 0,target,user,text,cleaned_text
7126,0,andrey_romanoff,@newslava welcome to the club,welcome club
46624,0,rosiej,Thai place got my order wrong I want my noodl...,thai place got order wrong want noodles
67020,0,devilmaysigh,tummy ache..,tummy ache
67356,0,hannahshu,"Hates when people, aka my sisters, eat my ice ...",hates people aka sisters eat ice cream home en...
61688,0,carol_13,To doente,doente


In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab_size = 10000  # Defines the maximum number of unique words
max_length = 100    # Defines the maximum length of the sequences
trunc_type = 'post' # Truncates the sequences from the end if they exceed max_length
padding_type = 'post' # Pads sequences at the end

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_text'])  # Fit it to the texts

# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])

# Pad the sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Now padded_sequences is ready for use in a CNN
print(padded_sequences)

2024-04-30 12:17:04.089731: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-30 12:17:04.089880: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-30 12:17:04.289059: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[[101 228   0 ...   0   0   0]
 [229  42  17 ...   0   0   0]
 [231 232   0 ...   0   0   0]
 ...
 [ 12 209  12 ...   0   0   0]
 [117 101  86 ...   0   0   0]
 [  5  80  77 ...   0   0   0]]
