In [86]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from nltk.tokenize import word_tokenize

In [87]:
df = pd.read_csv('sentiment_dataset_n.csv',encoding='latin-1')
df['val'] = df.iloc[:,1]
df['id'] = df.iloc[:,2]
df['timestamp'] = df.iloc[:,3]
df['query'] = df.iloc[:,4]
df['username'] = df.iloc[:,5]
df['tweet'] = df.iloc[:,6]

# Use loc instead of iloc to select columns by their names
df = df.loc[:, ['val', 'id', 'timestamp', 'query', 'username', 'tweet']]


In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   val        1599999 non-null  int64 
 1   id         1599999 non-null  object
 2   timestamp  1599999 non-null  object
 3   query      1599999 non-null  object
 4   username   1599999 non-null  object
 5   tweet      1599999 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [89]:
df.describe()

Unnamed: 0,val,tweet
count,1599999.0,1599999.0
mean,1998818000.0,1998818000.0
std,193575700.0,193575700.0
min,1467811000.0,1467811000.0
25%,1956916000.0,1956916000.0
50%,2002102000.0,2002102000.0
75%,2177059000.0,2177059000.0
max,2329206000.0,2329206000.0


In [90]:
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Define the preprocess_text function with stopwords removal
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    # Remove mentions
    text = re.sub(r"@[A-Za-z0-9]+", "", text)
    # Remove non-alphabetic characters
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # Tokenize the text
    words = text.split()
    # Remove stopwords
    filtered_words = [word for word in words if word not in stop_words]
    # Join the words back into a single string
    return " ".join(filtered_words)

# Apply preprocessing to the 'tweet' column using vectorization
vectorized_preprocess = np.vectorize(preprocess_text)
df['clean_tweet'] = vectorized_preprocess(df['tweet'])

# Display the first few rows of the DataFrame
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\golis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: 'numpy.int64' object has no attribute 'lower'

In [None]:
nltk.download('punkt')

def tokenize_text(text):
    return word_tokenize(text)

df['tokenized_tweet'] = df['clean_tweet'].apply(tokenize_text)

print(df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\golis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


          val                            id timestamp          query  \
0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY  scotthamilton   
1  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY       mattycus   
2  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY        ElleCTF   
3  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         Karoli   
4  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY       joy_wolf   

                                            username       tweet clean_tweet  \
0  is upset that he can't update his Facebook by ...  1467810672               
1  @Kenichan I dived many times for the ball. Man...  1467810917               
2    my whole body feels itchy and like its on fire   1467811184               
3  @nationwideclass no, it's not behaving at all....  1467811193               
4                      @Kwesidei not the whole crew   1467811372               

  tokenized_tweet  
0              []  
1              []  
2              []  
3     

In [None]:
from nltk.stem import PorterStemmer

# Initialize PorterStemmer
porter_stemmer = PorterStemmer()

# Stemming function
def stem_text(tokens):
    return [porter_stemmer.stem(token) for token in tokens]

# Apply stemming to the 'tokenized_tweet' column and create a new column 'stemmed_tweet'
df['stemmed_tweet'] = df['tokenized_tweet'].apply(stem_text)

# Display the first few rows of the DataFrame with the new 'stemmed_tweet' column
df.head()


Unnamed: 0,val,id,timestamp,query,username,tweet,clean_tweet,tokenized_tweet,stemmed_tweet
0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,1467810672,,[],[]
1,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,1467810917,,[],[]
2,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,1467811184,,[],[]
3,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",1467811193,,[],[]
4,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,1467811372,,[],[]


In [None]:
df['val'].value_counts()

2190457769    2
1974742852    2
2062516845    2
1551586713    2
1563681287    2
             ..
2197311343    1
2197311196    1
2197311146    1
2197310899    1
2193602129    1
Name: val, Length: 1598314, dtype: int64

In [None]:
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import numpy as np
import requests
import matplotlib.pyplot as plt

# Storing all the non-sexist/racist words
positive_words = ' '.join(text for text in df['clean_tweet'][df['val'] == 0])

# Generating the Twitter logo mask
url = 'http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png'
response = requests.get(url, stream=True)
mask = np.array(Image.open(response.raw))

# We will use the ImageColorGenerator to generate the color of the image
image_color = ImageColorGenerator(mask)

# Now we will use the WordCloud function of the wordcloud library
wc = WordCloud(background_color='black', height=1500, width=4000, mask=mask).generate(positive_words)

# Size of the image generated
plt.figure(figsize=(10, 20))

# Here we recolor the words from the dataset to the image's color
# Interpolation is used to smooth the image generated
plt.imshow(wc.recolor(color_func=image_color), interpolation="hamming")

plt.axis('off')
plt.show()


ValueError: We need at least 1 word to plot a word cloud, got 0.

In [None]:
# Storing the words with label '1' (negative words)
negative_words = ' '.join(text for text in df['clean_tweet'][df['val'] == 1])

# Generating the Twitter logo mask
url = 'http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png'
response = requests.get(url, stream=True)
mask = np.array(Image.open(response.raw))

# Using ImageColorGenerator to generate the color of the image
image_colors = ImageColorGenerator(mask)

# Now we use the WordCloud function from the wordcloud library 
wc = WordCloud(background_color='black', height=1500, width=4000, mask=mask).generate(negative_words)

# Size of the image generated 
plt.figure(figsize=(10, 20))

# Here we recolor the words from the dataset to the image's color
# Recolor just recolors the default colors to the image's blue color
# Interpolation is used to smooth the image generated 
plt.imshow(wc.recolor(color_func=image_colors), interpolation="gaussian")

plt.axis('off')
plt.show()


ValueError: We need at least 1 word to plot a word cloud, got 0.