In [13]:
import pandas as pd

# Read the training and validation splits from their CSV files (paths are
# relative to the notebook's working directory).
train_data, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in ('twitter_training.csv', 'twitter_validation.csv')
)


In [14]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources used further down: the stop-word lists and the
# 'punkt' tokenizer data (used by word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

# Report which Python types occur in the 'text' column; anything other than
# str (e.g. float NaN for a missing tweet) must be handled before cleaning.
print(train_data['text'].apply(type).value_counts())

# Replace missing tweets with an empty string *before* casting to str.
# A bare .astype(str) turns NaN into the literal text 'nan', which is purely
# alphabetic, survives preprocessing, and ends up as a bogus token in the
# features built from the cleaned text.
train_data['text'] = train_data['text'].fillna('').astype(str)
validation_data['text'] = validation_data['text'].fillna('').astype(str)
# Per-language stop-word sets, filled on first use so the NLTK corpus is read
# once instead of once per token.
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words(language='english'):
    """Return the NLTK stop words for ``language`` as a cached frozenset."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def preprocess(text):
    """Normalise a tweet into a space-joined string of lowercase content words.

    Steps, in order: strip URLs, @mentions and '#' signs; tokenize with
    ``word_tokenize``; keep only purely alphabetic tokens, lowercased; drop
    English stop words.

    Args:
        text: Raw tweet text. Non-string values (e.g. a float NaN standing in
            for a missing tweet) are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces, or '' if none survive.
    """
    # Missing values arrive as non-str objects; re.sub would raise on them.
    if not isinstance(text, str):
        return ''

    # Remove special characters and links
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Look the stop words up once per call (cached across calls); set
    # membership replaces the per-token list rebuild and linear scan.
    stop_words = _stop_words('english')

    # Lowercasing and removing stop words
    lowered = (token.lower() for token in tokens if token.isalpha())
    return ' '.join(word for word in lowered if word not in stop_words)

# Store the preprocessed version of every tweet in a new 'clean_text' column,
# for the training split first and then the validation split.
for split_frame in (train_data, validation_data):
    split_frame['clean_text'] = split_frame['text'].apply(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


text
<class 'str'>      73995
<class 'float'>      686
Name: count, dtype: int64


In [15]:
# Preview the first rows of the cleaned text: training split, then validation.
for split_frame in (train_data, validation_data):
    print(split_frame[['clean_text']].head())



                      clean_text
0            coming borders kill
1    im getting borderlands kill
2   im coming borderlands murder
3  im getting borderlands murder
4  im getting borderlands murder
                                          clean_text
0  bbc news amazon boss jeff bezos rejects claims...
1               pay word functions poorly chromebook
2  csgo matchmaking full closet hacking truly awf...
3  president slapping americans face really commi...
4  hi madeleine mccann cellar past years little s...


In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# TF-IDF featurizer for the cleaned tweets, capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)

# Targets are taken straight from the 'type' column of each split.
y_train = train_data['type']
y_valid = validation_data['type']

# The vectorizer is fitted on the training text only; the validation text is
# then transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_data['clean_text'])
X_valid = vectorizer.transform(validation_data['clean_text'])





In [18]:
# Logistic-regression classifier with the iteration cap set to 1000
# (raise it further if the solver reports that it did not converge).
model = LogisticRegression(max_iter=1000)

# Fit on the TF-IDF training features and their labels.
model.fit(X=X_train, y=y_train)


In [19]:
# Label the validation tweets with the fitted classifier.
y_pred = model.predict(X_valid)

# Overall accuracy first, printed to two decimals ...
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# ... followed by the per-class classification report.
print(classification_report(y_true=y_valid, y_pred=y_pred))


Accuracy: 0.81
              precision    recall  f1-score   support

  Irrelevant       0.82      0.68      0.75       171
    Negative       0.75      0.86      0.80       266
     Neutral       0.84      0.78      0.81       285
    Positive       0.83      0.86      0.84       277

    accuracy                           0.81       999
   macro avg       0.81      0.79      0.80       999
weighted avg       0.81      0.81      0.80       999

