<a href="https://colab.research.google.com/github/MarehWilliams01/Assessing-Xenophobia/blob/main/assessing_xenophobia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing the necessary libraries
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression


# Fetch the NLTK resources used later in the notebook: the English stopword
# list and the Punkt tokenizer models that back `word_tokenize`.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.8.2+ loads the tokenizer from the separate 'punkt_tab' package, so
# `word_tokenize` raises LookupError there unless it is downloaded as well.
nltk.download('punkt_tab')

In [None]:
# setting up the cleaned xenophobia dataset

# read the raw CSV and discard the columns this analysis does not use
data = pd.read_csv('/content/drive/MyDrive/Datasets/SA_XenoDataset_2017-2022.csv')
unused_columns = [
    'Unnamed: 0', 'Likes', 'No of replies', 'Language', 'Coordinates',
    'Number of Attacks', 'Sentiment', 'User Location', 'District', 'Province',
]
df = pd.DataFrame(data.drop(columns=unused_columns, axis=1))

# 'TimeCreated': parse to datetime, then keep only the time-of-day part
parsed_times = pd.to_datetime(df['TimeCreated'])
df['TimeCreated'] = pd.to_datetime(parsed_times, format='%H:%M:%S').dt.time

# 'DateCreated': parse to datetime, then keep only the calendar date
df['DateCreated'] = pd.to_datetime(df['DateCreated']).dt.date

# rebuild one full timestamp from the "<date> <time>" text of both columns
timestamp_text = df['DateCreated'].astype(str) + ' ' + df['TimeCreated'].astype(str)
df['DateTimeCreated'] = pd.to_datetime(timestamp_text)

# extract the year of each timestamp into its own column
df['Year'] = df['DateTimeCreated'].dt.year

# discard every row that still has a missing value
df = df.dropna()

df

In [None]:
# preprocessing tweets
# show which Python type each 'CleanedTweet' entry holds
tweet_column = df['CleanedTweet']
tweet_column.apply(type)

In [None]:
# developing a dictionary for shorthand texts
# send a GET request
url = "https://messente.com/blog/text-abbreviations"
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# surfaces HTTP errors here instead of as confusing parse failures below.
response = requests.get(url, timeout=30)
response.raise_for_status()

# parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# the abbreviation lists are read from the second and third <b> tags
b_tags = soup.find_all("b")
if len(b_tags) < 3:
    raise ValueError(
        f"Expected at least 3 <b> tags at {url}, found {len(b_tags)}; "
        "the page layout may have changed."
    )
second_b_tag = b_tags[1]
third_b_tag = b_tags[2]

# all paragraphs inside those tags, excluding the first paragraph of each
second_paragraphs = second_b_tag("p")[1:]
third_paragraphs = third_b_tag("p")[1:]

# Each entry is split once into "<slang>" and "<description>" on a dash that
# is surrounded by whitespace. A real en dash is accepted as well as its
# mis-decoded (UTF-8 read as cp1252) form, so the split works regardless of
# how the page text was decoded.
separator_pattern = re.compile(r"\s+(?:\u2013|\u00e2\u20ac\u201c|â€“)\s+")

# only the first 100 entries are kept
max_entries = 100

# extract the slang words and descriptions
slang_list = []
description_list = []
for paragraph in (second_paragraphs + third_paragraphs)[:max_entries]:
    split_text = separator_pattern.split(paragraph.text.strip(), maxsplit=1)
    # drop a leading "<n>. " prefix and normalise the slang to lower case
    slang_list.append(split_text[0].split(". ", 1)[-1].lower())
    # entries without a separator get an empty description
    description_list.append(split_text[1] if len(split_text) > 1 else "")

# create a DataFrame from the extracted data
df_slangs = pd.DataFrame({
    "slang": slang_list,
    "description": description_list
})

# print the DataFrame
print(df_slangs)

In [None]:
# preprocessing the data

# developing a function to clean the data
def clean_tweet(text, slang_lookup=None):
  """Expand known shorthand in a tweet and return the result lower-cased.

  Parameters
  ----------
  text : str
      The tweet text; it is split on whitespace.
  slang_lookup : dict, optional
      Mapping of lower-case slang word -> description. When omitted it is
      built from the notebook-level `df_slangs` frame ('slang' and
      'description' columns), keeping the first description of any
      duplicated slang.

  Returns
  -------
  str
      The words re-joined with single spaces, with each known slang word
      replaced by its description, all in lower case.
  """
  if slang_lookup is None:
    slang_lookup = {}
    for slang, description in zip(df_slangs['slang'], df_slangs['description']):
      slang_lookup.setdefault(slang, description)

  normalized_words = []
  for word in text.split():
    # slang keys are stored lower-cased, so match case-insensitively
    description = slang_lookup.get(word.lower())
    # an empty description would silently delete the word, so keep the word
    normalized_words.append(description if description else word)

  return " ".join(normalized_words).lower()

# run the cleaning function over every tweet, then show the updated column
normalized_tweets = df['CleanedTweet'].apply(clean_tweet)
df['CleanedTweet'] = normalized_tweets

df['CleanedTweet']

In [None]:
# lemmatization

# load the spaCy pipeline used for lemmatization
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    """Return `text` with each spaCy token replaced by its lemma, space-joined."""
    return ' '.join([token.lemma_ for token in nlp(text)])

# lemmatize every tweet, then show the updated column
lemmatized_tweets = df['CleanedTweet'].apply(lemmatize_text)
df['CleanedTweet'] = lemmatized_tweets

df['CleanedTweet']

In [None]:
# removing stopwords

# getting the current list of English stopwords
stopword_list = stopwords.words('english')
# A set gives constant-time membership tests; the list above is kept so the
# name `stopword_list` still refers to the same object type as before.
stopword_set = set(stopword_list)

# function to remove stopwords from tweet
def remove_stopwords(text):
    """Return `text` without English stopwords.

    The text is tokenized with NLTK's `word_tokenize`; tokens whose
    lower-cased form is a stopword are dropped and the remaining tokens are
    joined with single spaces.
    """
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)

# apply stopword removal to the tweet column
df['CleanedTweet'] = df['CleanedTweet'].apply(remove_stopwords)

df['CleanedTweet']

In [None]:
# function to get sentiment using TextBlob
def get_sentiment(text):
    """Label `text` by the sign of its TextBlob polarity.

    Returns 'Positive' for polarity above zero, 'Negative' for polarity
    below zero and 'Neutral' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

# label every cleaned tweet and store the result in a 'Sentiment' column
sentiment_labels = df['CleanedTweet'].apply(get_sentiment)
df['Sentiment'] = sentiment_labels

df

In [None]:
# analyzing sentiment on xenophobia tweets over time

# seeing the different dates used in the dataset
# get unique time values
# list every distinct creation date present in the dataset, one per line
unique_times = df['DateCreated'].unique()
for created_on in unique_times:
    print(created_on)


# count tweets per (year, sentiment) pair and pivot sentiments into columns,
# using 0 where a combination never occurs
year_sentiment_sizes = df.groupby(['Year', 'Sentiment']).size()
sentiment_counts = year_sentiment_sizes.unstack(fill_value=0)

# display the result
print(sentiment_counts)


In [None]:
# finding the average sentiment of xenophobia-related tweets over time in years

# convert a sentiment label into a numeric score
def map_sentiment_to_numeric(sentiment):
    """Return -1 for 'Negative', 1 for 'Positive' and 0 for anything else."""
    for label, score in (('Negative', -1), ('Positive', 1)):
        if sentiment == label:
            return score
    return 0

# attach a numeric score (-1 / 0 / 1) to every tweet
df['SentimentNumeric'] = df['Sentiment'].apply(map_sentiment_to_numeric)

# extract year from 'DateTimeCreated' (repeated here so the cell is self-contained)
df['Year'] = df['DateTimeCreated'].dt.year

# per-year tally of each score and the most frequent score, kept for reference
sentiment_counts = df.groupby(['Year', 'SentimentNumeric']).size().unstack(fill_value=0)
most_common_sentiment = sentiment_counts.idxmax(axis=1)

# The chart is titled and labelled "Average Sentiment", so plot the yearly
# mean of the numeric scores rather than the most frequent score.
average_sentiment = df.groupby('Year')['SentimentNumeric'].mean()

# creating the line chart
plt.figure(figsize=(10, 6))
average_sentiment.plot(kind='line', marker='o')
plt.title('Average Sentiment of Xenophobia-Related Tweets Over Time')
plt.xlabel('Year')
plt.ylabel('Average Sentiment')
plt.grid(True)
plt.show()


In [None]:
# assessing long-term trends

# calculate sentiment proportions for each year
sentiment_proportions = (
    df.groupby(['Year', 'Sentiment']).size().unstack(fill_value=0)
    # make sure all three columns exist even if a label never occurs
    .reindex(columns=['Negative', 'Neutral', 'Positive'], fill_value=0)
    .div(df.groupby('Year').size(), axis=0)
)


def plot_sentiment_trend(proportions, sentiment_label):
    """Fit and plot a linear trend of one sentiment's yearly proportion.

    Parameters
    ----------
    proportions : pandas.DataFrame
        Indexed by year, with one column per sentiment label.
    sentiment_label : str
        Name of the column to model, e.g. 'Neutral'.

    Returns
    -------
    tuple
        (X, y, model, predictions): the reshaped year values, the observed
        proportions, the fitted LinearRegression and its predictions on X.
    """
    # scikit-learn expects a 2-D feature matrix, hence the reshape
    year_matrix = proportions.index.values.reshape(-1, 1)
    observed = proportions[sentiment_label]

    trend_model = LinearRegression()
    trend_model.fit(year_matrix, observed)
    predicted = trend_model.predict(year_matrix)

    plt.figure(figsize=(10, 6))
    plt.scatter(proportions.index, observed, color='blue', label='Actual')
    plt.plot(proportions.index, predicted, color='red', label='Trend Line')
    plt.xlabel('Year')
    plt.ylabel(f'Proportion of {sentiment_label} Sentiments')
    plt.title('Long-Term Sentiment Trends')
    plt.legend()
    plt.show()

    return year_matrix, observed, trend_model, predicted


# one chart per sentiment; X, y, model and predicted_sentiment_proportions
# end up holding the values of the last sentiment ('Positive')
for sentiment_label in ('Neutral', 'Negative', 'Positive'):
    X, y, model, predicted_sentiment_proportions = plot_sentiment_trend(
        sentiment_proportions, sentiment_label
    )



In [None]:
# extract month
df['Month'] = df['DateTimeCreated'].dt.month

# a month is flagged as notable when its mean sentiment score differs from
# the mean of the whole year by more than this amount
threshold = 0.05

# creating a subplot for each year, in chronological order
years = np.sort(df['Year'].unique())
num_years = len(years)
# squeeze=False always returns a 2-D array of axes, so indexing also works
# when the data covers a single year
fig, axes = plt.subplots(nrows=num_years, ncols=1, figsize=(10, 6 * num_years), squeeze=False)

# looping through each year
for i, year in enumerate(years):
    # filter data for the current year
    year_df = df[df['Year'] == year]

    # mean sentiment score of each month
    sentiment_distribution = year_df.groupby('Month')['SentimentNumeric'].mean()

    # Baseline: mean sentiment score of the whole year. (The previous
    # sum() / sum() expression was always 1, so it carried no information.)
    overall_sentiment_distribution = year_df['SentimentNumeric'].mean()

    # months whose mean departs from the yearly baseline by more than the threshold
    notable_months = sentiment_distribution[
        (sentiment_distribution - overall_sentiment_distribution).abs() > threshold
    ]

    # plot the monthly means, the yearly baseline, and mark notable months
    ax = axes[i, 0]
    ax.plot(sentiment_distribution.index, sentiment_distribution, color='grey', marker='o', label='Monthly Mean')
    ax.axhline(overall_sentiment_distribution, color='black', linestyle='--', label='Yearly Mean')
    ax.scatter(notable_months.index, notable_months, color='red', label='Notable Months', s=50, zorder=3)
    ax.set_xlabel('Month')
    ax.set_ylabel('Sentiment Distribution')
    ax.set_title(f'Notable Sentiment Distribution for Year {year}')
    ax.set_xticks(range(1, 13))
    ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], rotation=0)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# group the data by tweet origin and sentiment, and find the most common
# sentiment label of each origin
sentiment_counts = df.groupby(['Tweet Origin', 'Sentiment']).size().unstack(fill_value=0)
most_common_sentiment = sentiment_counts.idxmax(axis=1)

# scatter plot with one point per region, placed at that region's most
# common sentiment label (the y-axis is categorical, not a numeric score)
plt.figure(figsize=(50, 20))
plt.scatter(most_common_sentiment.index, most_common_sentiment.values, color='skyblue')
plt.xlabel('Region', fontsize=14)
plt.ylabel('Most Common Sentiment', fontsize=14)
plt.title('Sentiment Variations Across Different Regions', fontsize=16)
plt.xticks(rotation=45, fontsize=17)
plt.yticks(fontsize=17)
plt.tight_layout()

# show the plot
plt.show()

In [None]:
# cut-offs on the numeric sentiment score for the "high" and "low" groups
high_sentiment_threshold = 0.5
low_sentiment_threshold = -0.5

# boolean masks selecting rows strictly above / strictly below the cut-offs
is_high_sentiment = df['SentimentNumeric'] > high_sentiment_threshold
is_low_sentiment = df['SentimentNumeric'] < low_sentiment_threshold

# split the data into the two groups
high_sentiment_periods = df[is_high_sentiment]
low_sentiment_periods = df[is_low_sentiment]

# function to calculate user engagement metrics
def calculate_engagement_metrics(data):
    """Summarise sentiment and engagement for a set of tweets.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'SentimentNumeric', 'Retweets' and
        'Sentiment'.

    Returns
    -------
    tuple
        (mean of 'SentimentNumeric', mean of 'Retweets', share of rows whose
        'Sentiment' is 'Positive'). All three are NaN when `data` has no rows.
    """
    avg_sentiment = data['SentimentNumeric'].mean()
    retweet_count = data['Retweets'].mean()
    # The mean of the boolean mask equals positives / rows, but gives NaN for
    # an empty frame instead of raising ZeroDivisionError.
    positive_message_share = (data['Sentiment'] == 'Positive').mean()
    return avg_sentiment, retweet_count, positive_message_share

# engagement metrics of the high-sentiment group
high_values = list(calculate_engagement_metrics(high_sentiment_periods))
high_avg_sentiment, high_retweet_count, high_positive_share = high_values

# engagement metrics of the low-sentiment group
low_values = list(calculate_engagement_metrics(low_sentiment_periods))
low_avg_sentiment, low_retweet_count, low_positive_share = low_values

# names of the three metrics, in the order the function returns them
metrics = ['Average Sentiment', 'Average Retweets', 'Positive Message Share']

# draw both groups on the same axes; the low group is semi-transparent so
# the high group's bars stay visible behind it
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(metrics, high_values, color='blue', label='High Sentiment Periods')
ax.bar(metrics, low_values, color='red', label='Low Sentiment Periods', alpha=0.5)
ax.set_ylabel('Metrics')
ax.set_title('User Engagement Patterns During High and Low Sentiment Periods')
ax.legend()
fig.tight_layout()
plt.show()
