**Load the Dataset**

In [None]:
import pandas as pd

# Location of the tweet CSV inside the Colab runtime
file_path = '/content/sample_data/Twitter_Data.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(file_path)

# Preview the top rows as the cell's output
df.head()


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.





**Text Preprocessing**

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd

# Download the NLTK resources used by the preprocessing step below:
# the stopword lists and the tokenizer models behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer model from 'punkt_tab' rather than
# 'punkt'; fetching both keeps word_tokenize working on either side of that
# change (on older releases this is just a harmless extra download).
nltk.download('punkt_tab')

# Load the dataset, ensuring the column with the text data is named correctly
file_path = '/content/sample_data/Twitter_Data.csv'
df = pd.read_csv(file_path)

# Check the actual column names in your DataFrame
print(df.columns)

# Patterns used by preprocess_text, compiled once instead of on every call.
_URL_PATTERN = re.compile(r'http\S+|www\S+')  # 'http\S+' already covers https links
_MENTION_PATTERN = re.compile(r'@\w+')
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z]')

# Lazily-filled cache so the NLTK stopword corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


# Function to preprocess text
def preprocess_text(text):
    """Normalize one tweet into a space-separated string of content words.

    Steps: lowercase, strip URLs, strip @mentions, drop '#' symbols (the
    hashtag word itself is kept), replace every non-alphabetic character
    with a space, tokenize with NLTK, and remove English stopwords.

    Args:
        text: The raw tweet. Missing values (None / NaN) yield an empty
            string; any other non-string value is converted with str().

    Returns:
        The cleaned text, or '' when nothing is left after cleaning.
    """
    # Missing values would otherwise be stringified into the token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)  # Convert to string if it's not already

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    # The previous pattern r'\@w+' matched a literal 'w', so mentions survived.
    text = _MENTION_PATTERN.sub('', text)
    text = text.replace('#', '')
    text = _NON_ALPHA_PATTERN.sub(' ', text)

    # Nothing but whitespace left: skip the tokenizer altogether.
    if not text.strip():
        return ''

    stop_words = _english_stopwords()
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Run the preprocessing over the raw tweet column ('clean_text' in this CSV;
# change the name here if your file uses a different one) and attach the
# result as a new 'cleaned_text' column.
df = df.assign(cleaned_text=df['clean_text'].apply(preprocess_text))

# Preview the dataframe, now including the cleaned text
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Index(['clean_text', 'category'], dtype='object')


Unnamed: 0,clean_text,category,cleaned_text
0,when modi promised “minimum government maximum...,-1.0,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar names modi ...
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...


**Sentiment Analysis with TextBlob**

In [None]:
from textblob import TextBlob

# Map a text to a coarse sentiment label using TextBlob's polarity score
def get_sentiment(text):
    """Label `text` as 'positive', 'negative' or 'neutral'.

    The label is the sign of TextBlob's polarity for the text: above zero is
    positive, below zero is negative, and exactly zero is neutral.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

# Label every cleaned tweet and attach the result as a 'sentiment' column
df = df.assign(sentiment=df['cleaned_text'].apply(get_sentiment))

# Preview the dataframe, now including the sentiment labels
df.head()


Unnamed: 0,clean_text,category,cleaned_text,sentiment
0,when modi promised “minimum government maximum...,-1.0,modi promised minimum government maximum gover...,negative
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi,neutral
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...,positive
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar names modi ...,positive
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...,positive


**Advanced Analysis with Machine Learning**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Vectorize the text data into TF-IDF features (vocabulary capped at 1000 terms).
# The matrix is kept sparse: train_test_split and MultinomialNB both accept
# SciPy sparse input, and densifying with .toarray() would allocate a full
# n_rows x 1000 float array for no benefit. Call X.toarray() only for an
# estimator that really requires dense input.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['cleaned_text'])

# NOTE(review): the target is the TextBlob-derived 'sentiment' column, so the
# scores below measure agreement with TextBlob rather than with the dataset's
# own 'category' labels - confirm this is intended.
y = df['sentiment']

# Split the data into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.7618419437967848
Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.44      0.59      6925
     neutral       0.76      0.83      0.79     12007
    positive       0.73      0.87      0.79     13664

    accuracy                           0.76     32596
   macro avg       0.80      0.71      0.73     32596
weighted avg       0.78      0.76      0.75     32596

