In [None]:
import nltk
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Fetch the NLTK data used later in the notebook: the English stop-word list
# and the Punkt sentence-tokenizer models that word_tokenize relies on.
# Recent NLTK releases (3.9+) load the pickle-free 'punkt_tab' package instead
# of 'punkt', so both are requested to keep the notebook runnable on a fresh
# kernel regardless of the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Location of the CSV file.  NOTE: this is an absolute, machine-specific path.
url = '/Users/keshavsaraogi/data/analysis.csv'

# The file is read without a header row, so the six column names are supplied
# explicitly; the bytes are decoded as latin-1.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv(url, header=None, names=column_names, encoding='latin-1')

In [None]:
# Quick overview of the loaded frame: first rows, per-column dtypes and
# non-null counts, then summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare -- wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())

In [None]:
# Count the missing values in every column and show the per-column totals.
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
"Removing HTML tags, URLs, emojis, special characters, digits, and convert to lower case"

def clean_text(text):
    """Normalise one raw text value to lower-case ASCII letters and whitespace.

    Steps, in order: drop HTML-style tags (``<...>``), drop URLs (any run of
    non-space characters beginning at ``http``), discard non-ASCII characters
    (this removes emojis), delete every remaining character that is not a
    letter or whitespace (digits, punctuation), then lower-case the result and
    trim leading/trailing whitespace.

    Args:
        text: The raw string.  Non-string values (e.g. the ``None``/``NaN``
            placeholders pandas uses for missing cells) are treated as empty.

    Returns:
        str: The cleaned text; ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on None/NaN; a missing value has no text.
        return ''
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    # Round-trip through ASCII, silently dropping anything that cannot be encoded.
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()
    return text

# Store a cleaned copy of each row's text next to the raw column.
raw_text = df['text']
df['cleaned_text'] = raw_text.apply(clean_text)

In [None]:
# Show the raw and cleaned text side by side for the first few rows.
text_columns = ['text', 'cleaned_text']
preview = df[text_columns].head()
print(preview)

In [None]:
# English stop-word set, built on first use and then shared by every call so
# the NLTK corpus is not re-read for each row.
_englishStopWords = None


def tokenizeAndRemoveStopWords(text):
    """Tokenize ``text`` and return it with English stop words removed.

    The text is split with NLTK's ``word_tokenize``; tokens found in NLTK's
    English stop-word list are dropped.  The comparison is case-sensitive, so
    the input should already be lower-cased.

    Args:
        text (str): The text to process.

    Returns:
        str: The surviving tokens joined by single spaces, so downstream
        vectorizers can still see the individual words.
    """
    global _englishStopWords
    if _englishStopWords is None:
        _englishStopWords = frozenset(stopwords.words('english'))
    words = word_tokenize(text)
    filteredWords = [word for word in words if word not in _englishStopWords]
    # Join with a space: an empty separator would fuse all tokens into one word.
    return ' '.join(filteredWords)

# Run stop-word removal over the cleaned text and keep the result in its own column.
cleaned_text = df['cleaned_text']
df['processed_text'] = cleaned_text.apply(tokenizeAndRemoveStopWords)

In [None]:
# Build the TF-IDF feature matrix from the processed text, capping the
# vocabulary size at 5000 terms.
# NOTE(review): the vectorizer is fitted on the full corpus here, before the
# train/test split further down, so the learned vocabulary/IDF weights also
# see the rows that end up in the test set -- confirm this is intended.
MAX_TFIDF_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
processed_documents = df['processed_text']
X = vectorizer.fit_transform(processed_documents)

In [None]:
"Sentiment140 target column has '0' for negative sentiment and '4' for positive sentiment"

# Binary labels: 1 where the target column equals 4, otherwise 0.
is_positive = df['target'] == 4
y = np.where(is_positive, 1, 0)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible between runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts