**Sentiment Analysis**

In [5]:
import pandas as pd
from textblob import TextBlob


# Load the cleaned tweets from CSV. `df` and `final_df` are two names for the
# same DataFrame object (no copy is made), so a change made through one name
# is visible through the other.
file_path = "cleaned_dataset.csv"
df = final_df = pd.read_csv(file_path)

# Function to get sentiment polarity from a text using TextBlob
def get_sentiment(text):
    """Label a piece of text as 'Positive', 'Negative', or 'Neutral'.

    The input is coerced with ``str()`` and scored with TextBlob; the label
    depends only on the sign of the resulting polarity score.
    """
    polarity = TextBlob(str(text)).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    # Polarity is neither above nor below zero
    return 'Neutral'

# Label every tweet by running get_sentiment over the 'tweet' column
df['sentiment'] = df['tweet'].map(get_sentiment)

# Persist the labelled data to a new CSV file (row index omitted)
df.to_csv('output_dataframe_with_sentiment.csv', index=False)


# Show each tweet next to the label it was assigned
labelled_preview = df[['tweet', 'sentiment']]
print(labelled_preview)


                                                  tweet sentiment
0     VAR . @var_austin.  VARSITY | AUSTIN REGIONAL ...   Neutral
1     Former Premier League ref Jeff Winter: "I woul...  Negative
2     VAR's fundamental problem is that the ref on t...  Positive
3     I wanted #VAR as I wanted the game to progress...  Positive
4     An issue with VAR is that it provides referees...  Positive
...                                                 ...       ...
3878  The VAR Show ( The View And Review Show ) - Th...   Neutral
3879  A reminder of this week's VAR Review: Arsenal'...  Positive
3880  After a VAR check the penalty is overturned. Q...   Neutral
3881  VAR is great. People in charge of VAR need rep...  Positive
3882                      Anything for the VAR Review?    Neutral

[3883 rows x 2 columns]


In [6]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many tweets received each sentiment label
ax = sns.countplot(data=df, x='sentiment')
ax.set_title('Sentiment Distribution')
plt.show()


ImportError: DLL load failed while importing _path: The specified module could not be found.

In [None]:
# Pie chart of each sentiment label's share of the tweets
sentiment_counts = df['sentiment'].value_counts()
ax = sentiment_counts.plot.pie(autopct='%1.1f%%')
ax.set_title('Sentiment Distribution')
plt.show()


In [None]:
from wordcloud import WordCloud

# Join all tweets into a single string.
# Missing values are dropped and the remainder coerced to str first, because
# str.join raises TypeError on any non-string element (e.g. the NaN pandas
# produces for an empty CSV field).
all_tweets = ' '.join(df['tweet'].dropna().astype(str))

# Generate a word cloud from the combined text
wordcloud = WordCloud(width=800, height=400, random_state=21, max_font_size=110).generate(all_tweets)

# Plot the WordCloud image with the axes hidden
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Pie chart of sentiment shares with percentage labels. Expects df to
# already carry the 'sentiment' column.
fig, ax = plt.subplots(figsize=(8, 8))
df['sentiment'].value_counts().plot.pie(ax=ax, autopct='%1.1f%%', startangle=90)
ax.set_title('Sentiment Distribution')
ax.set_ylabel('')  # Drop the y-axis label pandas puts on pie charts
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'timestamp' is your timestamp column.
# Work on a derived frame instead of calling set_index(..., inplace=True) on
# df: the in-place version removes the 'timestamp' column, so running this
# cell a second time failed with KeyError.
tweets_timed = df.assign(timestamp=pd.to_datetime(df['timestamp']))

# Count tweets per (week, sentiment). A Resampler object has no
# value_counts() method, so the weekly bucketing is done with pd.Grouper
# inside a regular groupby. Result: one row per week, one column per label.
weekly_sentiments = (
    tweets_timed
    .groupby([pd.Grouper(key='timestamp', freq='W'), 'sentiment'])
    .size()
    .unstack(fill_value=0)
    # Re-insert weeks that had no tweets at all so the lines drop to zero
    # instead of being drawn straight across the gap.
    .asfreq('W', fill_value=0)
)

# Plotting: one line per sentiment label
sns.lineplot(data=weekly_sentiments, markers=True)
plt.title('Sentiment Trends Over Time')
plt.ylabel('Number of tweets')
plt.show()

In [None]:
from collections import Counter

# Tokenize on whitespace and count words.
# Missing tweets are dropped and the rest coerced to str so that a NaN in the
# column cannot raise TypeError; splitting tweet by tweet yields the same
# tokens as joining everything first, without building one huge string.
tweet_texts = df['tweet'].dropna().astype(str)
words = [word for tweet in tweet_texts for word in tweet.split()]
word_counts = Counter(words)

# Plot the top N words
# NOTE(review): tokens are counted as-is (case-sensitive, punctuation and
# stop words included) — confirm that is the intended definition of "word".
top_words = word_counts.most_common(10)
sns.barplot(x=[word for word, _ in top_words], y=[count for _, count in top_words])
plt.title('Top 10 Most Common Words')
plt.show()

In [None]:
# Per-user sentiment counts: one row per user, one column per label, with 0
# where a user has no tweets carrying that label.
user_sentiments = (
    df.groupby('user')['sentiment']
    .value_counts()
    .unstack()
    .fillna(0)
)
ax = user_sentiments.plot.bar(stacked=True)
ax.set_title('Sentiment Distribution by User')
plt.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts for the 'tweet' text column, English stop words removed
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['tweet'])

# Fit a 5-topic LDA model; `topics` holds the per-tweet topic weights
lda = LatentDirichletAllocation(n_components=5, random_state=42)
topics = lda.fit_transform(X)

# Histogram of the highest-weighted topic for each tweet
dominant_topic = topics.argmax(axis=1)
topic_bins = range(lda.n_components + 1)
ax = sns.histplot(dominant_topic, bins=topic_bins, discrete=True)
ax.set_title('Topic Distribution in Tweets')
plt.show()
