In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)


## Import Dataset

In [None]:
# Location of the raw tweet/engagement CSV inside the Kaggle input mount.
DATA_PATH = '/kaggle/input/tweets-and-user-engagement/Twitterdatainsheets.csv'

# Read the whole file into a DataFrame with pandas' default parsing options.
df = pd.read_csv(DATA_PATH)

In [None]:
# Call info() so the summary (index range, per-column non-null counts and
# dtypes, memory usage) is printed. The bare attribute `df.info` only
# displays the repr of the bound method, not the summary.
df.info()

In [None]:
# Show the column labels exactly as they were read from the CSV.
df.columns

In [None]:
# Remove leading/trailing whitespace from every column label.
stripped_names = df.columns.str.strip()
df.columns = stripped_names

# Display the labels again to confirm the clean-up.
df.columns


## Data Preprocessing

In [None]:
# Per-column count of missing entries.
missing_values = df.isna().sum()

# Per-column dtype as inferred when the CSV was read.
data_types = df.dtypes

# Print each summary under its own heading.
for heading, summary in (
    ("Missing Values:", missing_values),
    ("\nData Types:", data_types),
):
    print(heading)
    print(summary)


This code cell performs an initial check of the dataset (`df`): it counts the missing values in each column and lists the data type of each column.

In [None]:
# Drop rows that have a missing value in any of the key columns.
# `.copy()` makes df_cleaned an independent DataFrame: later cells add and
# overwrite columns on it, and doing that on the filtered result directly
# triggers pandas' SettingWithCopyWarning.
df_cleaned = df.dropna(subset=['TweetID', 'Weekday', 'Hour', 'Day', 'Lang']).copy()

# Display the shape of the cleaned DataFrame
print("Shape after handling missing values:", df_cleaned.shape)


This code cell cleans the dataset (`df`) by removing rows that have missing values in specific columns.

**Drop Rows with Missing Values:**

- The `dropna()` method removes every row that has a missing value in any of the specified columns (`'TweetID'`, `'Weekday'`, `'Hour'`, `'Day'`, and `'Lang'`).
- The cleaned DataFrame is stored in the variable `df_cleaned`.

**Display the Shape of the Cleaned DataFrame:**

- The `shape` attribute gives the dimensions (number of rows and columns) of the cleaned DataFrame.
- The result is printed to the console.

In [None]:
# Columns that are expected to hold numeric values
numeric_columns = ['Hour', 'Day', 'IsReshare', 'Reach', 'RetweetCount', 'Likes', 'Klout', 'Sentiment', 'LocationID']

# Convert each listed column with pd.to_numeric; entries that cannot be parsed
# become NaN (errors='coerce').
# The columns are replaced wholesale through assign() instead of being written
# with `df_cleaned.loc[:, numeric_columns] = ...`: from pandas 2.0 on, that
# form of assignment sets the values in place and keeps the existing column
# dtype, so object columns would stay `object` and the conversion would not be
# visible in `dtypes`. assign() also returns a new DataFrame, so no
# SettingWithCopyWarning is raised.
df_cleaned = df_cleaned.assign(
    **{col: pd.to_numeric(df_cleaned[col], errors='coerce') for col in numeric_columns}
)

# Display the data types after conversion
print("\nData Types after conversion:")
print(df_cleaned.dtypes)

# Check for missing values again (coercion may have introduced new NaNs)
missing_values_cleaned = df_cleaned.isnull().sum()
print("\nMissing Values after cleaning:")
print(missing_values_cleaned)




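This code cell converts the columns listed in `numeric_columns` of the cleaned DataFrame (`df_cleaned`) to numeric data types with `pd.to_numeric`. Values that cannot be parsed as numbers are coerced to `NaN`, so the missing-value counts are printed again afterwards.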

In [None]:
def _fill_with_column_mean(column):
    """Return `column` with its missing entries replaced by the column mean."""
    return column.fillna(column.mean())


# Fill the gaps column by column, then write the result back through .loc.
# NOTE(review): the mean is applied to every entry of numeric_columns, including
# any identifier- or flag-like columns; confirm that is intended.
imputed_numeric = df_cleaned.loc[:, numeric_columns].apply(_fill_with_column_mean)
df_cleaned.loc[:, numeric_columns] = imputed_numeric

# Show the first rows after imputation
print(df_cleaned.head())



This code cell is focused on imputing missing values in the numeric columns of the cleaned DataFrame (df_cleaned) using the mean value of each column.

## Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Basic statistics for numeric columns
numeric_stats = df_cleaned[numeric_columns].describe()
print("Basic Statistics for Numeric Columns:")
print(numeric_stats)

# Subsample the data so the plots stay fast (adjust SUBSAMPLE_SIZE as needed).
# - min(...) keeps sample() from raising when fewer rows than requested remain.
# - a fixed random_state makes the subsample, and therefore the plots,
#   reproducible across "Restart & Run All".
SUBSAMPLE_SIZE = 5000
RANDOM_SEED = 42
df_subsample = df_cleaned.sample(
    n=min(SUBSAMPLE_SIZE, len(df_cleaned)),
    random_state=RANDOM_SEED,
)

# Correlation heatmap for numeric columns
plt.figure(figsize=(12, 8))
sns.heatmap(df_subsample[numeric_columns].corr(), annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Correlation Heatmap for Numeric Columns (Subsample)')
plt.show()

# Pairplot for numeric columns; the title is lifted slightly (y=1.02) so it
# does not overlap the top row of panels.
sns.pairplot(df_subsample[numeric_columns])
plt.suptitle('Pairplot for Numeric Columns (Subsample)', y=1.02)
plt.show()




This code cell prints basic statistics for the numeric columns of the cleaned DataFrame (`df_cleaned`) and then, on a random subsample of its rows, draws a correlation heatmap and a pairplot of those columns.

## Sentiment Analysis

In [None]:
pip install nltk


In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon for sentiment analysis
nltk.download('vader_lexicon')

# Initialize the SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Score only the tweets that actually have text. astype(str) guards against
# non-string entries (e.g. a tweet parsed as a number), which the analyzer
# cannot process.
tweet_text = df_cleaned['text'].dropna().astype(str)
sentiment_scores = tweet_text.apply(lambda text: sia.polarity_scores(text)['compound'])

# assign() aligns the scores on the index (rows without text get NaN) and
# returns a new DataFrame, which avoids SettingWithCopyWarning when
# df_cleaned is a filtered view of the raw data.
df_cleaned = df_cleaned.assign(SentimentScore=sentiment_scores)

# Visualize sentiment distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_cleaned['SentimentScore'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Sentiment Scores in Tweets')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
plt.show()


This code cell focuses on sentiment analysis of the tweets using the VADER (Valence Aware Dictionary and sEntiment Reasoner) sentiment analysis tool. Let's go through each part of the code:

**Download VADER Lexicon:**

- The `nltk.download('vader_lexicon')` line downloads the VADER lexicon, a pre-built sentiment lexicon that NLTK's VADER analyzer relies on.

**Initialize SentimentIntensityAnalyzer:**

- The `SentimentIntensityAnalyzer` from NLTK's VADER module is initialized. This analyzer provides a compound sentiment score for a given text.

**Sentiment Analysis:**

- The sentiment of each tweet is analyzed using the VADER sentiment analyzer (rows without text are skipped and are left without a score).
- The compound sentiment score is obtained for each tweet's text by applying `sia.polarity_scores(x)['compound']`.

**Visualize Sentiment Distribution:**

- A histogram is created using Seaborn (`sns.histplot`) to visualize the distribution of sentiment scores in the tweets.
- The x-axis represents the sentiment scores, and the y-axis represents the number of tweets that fall into each score bin.
- The title, xlabel, and ylabel are set for better interpretation of the plot.

## Test Model

Each of the following code cells analyzes the sentiment of one specific tweet in the dataset. Here's a breakdown of the steps:

**Choose a Specific Tweet:**

- The variable `specific_tweet_index` is set to a specific index value. You can change this value to the index of the tweet you want to analyze.

**Retrieve Text of the Specific Tweet:**

- Using `df_cleaned.loc[specific_tweet_index, 'text']`, the text content of the specified tweet is extracted from the DataFrame.

**Calculate Sentiment Score:**

- The VADER `SentimentIntensityAnalyzer` (`sia`) is employed to compute the sentiment score for the specific tweet.
- The compound sentiment score is obtained by applying `sia.polarity_scores(specific_tweet_text)['compound']`.
- In the second and third cells, the score is also converted to a label: positive (score > 0), negative (score < 0), or neutral (score = 0).

**Print Results:**

- The text content of the specific tweet is printed for reference.
- The calculated sentiment score (and, where computed, the sentiment label) for the specific tweet is printed as well.

In [None]:
# Choose a specific tweet index
specific_tweet_index = 302  # Change this index to the tweet you want to analyze

# `.loc` looks the tweet up by index label. Rows removed while cleaning keep
# their label out of df_cleaned, so fail with a clear message instead of an
# opaque KeyError.
if specific_tweet_index not in df_cleaned.index:
    raise KeyError(f"No tweet with index {specific_tweet_index} in df_cleaned.")

# Get the text of the specific tweet
specific_tweet_text = df_cleaned.loc[specific_tweet_index, 'text']

# A missing text is NaN (a float), which the analyzer cannot process.
if pd.isna(specific_tweet_text):
    raise ValueError(f"Tweet {specific_tweet_index} has no text to analyze.")

# Calculate the sentiment score for the specific tweet
specific_tweet_sentiment_score = sia.polarity_scores(str(specific_tweet_text))['compound']

# Print the results
print(f"Text of the specific tweet:\n{specific_tweet_text}\n")
print(f"Sentiment Score for the specific tweet: {specific_tweet_sentiment_score}")


In [None]:
# Choose a specific tweet index
specific_tweet_index = 32  # Change this index to the tweet you want to analyze

# `.loc` looks the tweet up by index label. Rows removed while cleaning keep
# their label out of df_cleaned, so fail with a clear message instead of an
# opaque KeyError.
if specific_tweet_index not in df_cleaned.index:
    raise KeyError(f"No tweet with index {specific_tweet_index} in df_cleaned.")

# Get the text of the specific tweet
specific_tweet_text = df_cleaned.loc[specific_tweet_index, 'text']

# A missing text is NaN (a float), which the analyzer cannot process.
if pd.isna(specific_tweet_text):
    raise ValueError(f"Tweet {specific_tweet_index} has no text to analyze.")

# Calculate the sentiment score for the specific tweet
specific_tweet_sentiment_score = sia.polarity_scores(str(specific_tweet_text))['compound']

# Convert sentiment score to a categorical label.
# NOTE(review): any non-zero compound score counts as positive/negative here;
# VADER's documentation suggests +/-0.05 cut-offs. Consider whether those
# are preferable.
if specific_tweet_sentiment_score > 0:
    sentiment_label = 'positive'
elif specific_tweet_sentiment_score < 0:
    sentiment_label = 'negative'
else:
    sentiment_label = 'neutral'

# Print the results
print(f"Text of the specific tweet:\n{specific_tweet_text}\n")
print(f"Sentiment Score for the specific tweet: {specific_tweet_sentiment_score}")
print(f"Sentiment Label for the specific tweet: {sentiment_label}")


In [None]:
# Choose a specific tweet index
specific_tweet_index = 22588  # Change this index to the tweet you want to analyze

# `.loc` looks the tweet up by index label. Rows removed while cleaning keep
# their label out of df_cleaned, so fail with a clear message instead of an
# opaque KeyError.
if specific_tweet_index not in df_cleaned.index:
    raise KeyError(f"No tweet with index {specific_tweet_index} in df_cleaned.")

# Get the text of the specific tweet
specific_tweet_text = df_cleaned.loc[specific_tweet_index, 'text']

# A missing text is NaN (a float), which the analyzer cannot process.
if pd.isna(specific_tweet_text):
    raise ValueError(f"Tweet {specific_tweet_index} has no text to analyze.")

# Calculate the sentiment score for the specific tweet
specific_tweet_sentiment_score = sia.polarity_scores(str(specific_tweet_text))['compound']

# Convert sentiment score to a categorical label.
# NOTE(review): any non-zero compound score counts as positive/negative here;
# VADER's documentation suggests +/-0.05 cut-offs. Consider whether those
# are preferable.
if specific_tweet_sentiment_score > 0:
    sentiment_label = 'positive'
elif specific_tweet_sentiment_score < 0:
    sentiment_label = 'negative'
else:
    sentiment_label = 'neutral'

# Print the results
print(f"Text of the specific tweet:\n{specific_tweet_text}\n")
print(f"Sentiment Score for the specific tweet: {specific_tweet_sentiment_score}")
print(f"Sentiment Label for the specific tweet: {sentiment_label}")
