In [None]:
import pandas as pd
from datetime import timedelta


In [None]:
#Final Dataset of all articles about companies (without duplicates)

all_articles = pd.read_csv("FINAL_DATASET_FIX.csv", index_col=0)
all_articles

In [None]:
#Articles which mention a scandal in the title of the article 
scandal_title = all_articles[all_articles["Titlebased"]==1.0]
scandal_title.info()

In [None]:
scandal_title.to_csv("scandal_in_title.csv")

In [None]:
#Articles which mention a scandal in the text of the article 
scandal_text = all_articles[all_articles["Textbased"]==1.0]
scandal_text.info()

In [None]:
scandal_text.to_csv("scandal_in_text.csv")

In [None]:
# Articles which mention a scandal in the title AND the text of the article (not used in the research).
# Built by filtering the title-based subset again on the "Textbased" flag.
scandal_title_text = scandal_title[scandal_title["Textbased"]==1.0]
scandal_title_text.info()

# SEVERITY SCORE CREATION

## Severity Score for the Title Dataset

In [None]:
title_scandal = pd.read_csv("scandal_in_title.csv", index_col=0)
title_scandal.info()

In [None]:
# Convert date column to a date
title_scandal['Date'] = pd.to_datetime(title_scandal['Date'])

In [None]:

# Function to count additional articles in the following 14 days
def count_additional_articles(row, data):
    """Count follow-up articles about the same company within 14 days.

    Parameters
    ----------
    row : pandas.Series or mapping
        The article being scored; must provide 'company_in_title' and a
        datetime-like 'Date'.
    data : pandas.DataFrame
        Reference set of articles to search, with 'company_in_title' and
        'Date' columns.

    Returns
    -------
    int
        Number of rows in ``data`` with the same 'company_in_title' value and
        a 'Date' strictly after the article's date and at most 14 days later.
    """
    company = row['company_in_title']
    start_date = row['Date']
    end_date = start_date + timedelta(days=14)

    # Search the frame that was passed in (previously the global
    # `title_scandal` was used and the `data` argument was silently ignored).
    same_company = data['company_in_title'] == company
    in_window = (data['Date'] > start_date) & (data['Date'] <= end_date)
    return int((same_company & in_window).sum())



In [None]:
# Calculate severity scores for each articles
title_scandal['severity_score'] = title_scandal.apply(lambda row: count_additional_articles(row, title_scandal), axis=1)


In [None]:
#Find most severe articles
title_scandal[title_scandal['severity_score'] > 5].sort_values(by='Date')

In [None]:
title_scandal.to_csv("title_scandal_severity.csv")

## Severity Score for Text Dataset

In [None]:
text_scandal = pd.read_csv("scandal_in_text.csv", index_col=0)
text_scandal.info()

In [None]:


text_scandal['Date'] = pd.to_datetime(text_scandal['Date'])
title_scandal['Date'] = pd.to_datetime(title_scandal['Date'])

# Count the text-based articles that follow a given article within 14 days
def count_additional_articles(row, text_scandal):
    """Return how many rows of ``text_scandal`` follow ``row`` within 14 days.

    A row of ``text_scandal`` is counted when its 'company_in_text' equals
    ``row['company_in_text']`` and its 'Date' lies in the half-open window
    (row['Date'], row['Date'] + 14 days].
    """
    window_open = row['Date']
    window_close = window_open + timedelta(days=14)
    dates = text_scandal['Date']

    follow_ups = text_scandal[
        (text_scandal['company_in_text'] == row['company_in_text'])
        & (dates > window_open)
        & (dates <= window_close)
    ]
    return len(follow_ups)


In [None]:

# Apply the function to the scandal articles DataFrame to calculate severity scores
title_scandal['severity_score'] = title_scandal.apply(lambda row: count_additional_articles(row, text_scandal), axis=1)



In [None]:
text_based_scandal = title_scandal

In [None]:
text_based_scandal[text_based_scandal['severity_score'] > 10].sort_values(by='Date')

In [None]:
text_based_scandal.to_csv("text_scandal_severity.csv")

## Severity Score Based on All Company Articles

In [None]:

all_articles['Date'] = pd.to_datetime(all_articles['Date'])
title_scandal['Date'] = pd.to_datetime(title_scandal['Date'])

# Function to count additional articles in the following 14 days
def count_additional_articles(row, text_scandal):
    """Count articles about the same company published in the next 14 days.

    Parameters
    ----------
    row : pandas.Series or mapping
        The article being scored; must provide 'company_in_title' and a
        datetime-like 'Date'.
    text_scandal : pandas.DataFrame
        Reference set of articles to search (this cell passes
        ``all_articles``; the parameter name is kept for compatibility).
        Needs 'company_in_title' and 'Date' columns.

    Returns
    -------
    int
        Number of reference rows with the same 'company_in_title' value and a
        'Date' strictly after the article's date and at most 14 days later.
    """
    company = row['company_in_title']
    start_date = row['Date']
    end_date = start_date + timedelta(days=14)

    # Use the frame handed in by the caller instead of reaching for the
    # global `all_articles`, so the function works on whatever it is given.
    reference = text_scandal
    same_company = reference['company_in_title'] == company
    in_window = (reference['Date'] > start_date) & (reference['Date'] <= end_date)
    return int((same_company & in_window).sum())


In [None]:

# Apply the function to the scandal articles DataFrame to calculate severity scores
title_scandal['severity_score'] = title_scandal.apply(lambda row: count_additional_articles(row, all_articles), axis=1)



In [None]:
all_based_scandal = title_scandal

In [None]:
all_based_scandal[all_based_scandal['severity_score'] > 23].sort_values(by='Date')

In [None]:
all_based_scandal.to_csv("all_scandal_severity.csv")

# Plot with Volume of Companies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import ast
from matplotlib.ticker import FuncFormatter

# Load and sort the scandal data
all_articles = pd.read_csv('FINAL_DATASET_FIX.csv')
# `scandal_articles_df` was sorted here without ever being defined, which
# raises NameError on a fresh kernel. It is now derived from the frame loaded
# on the line above.
# NOTE(review): confirm this CSV is the intended source for scandal_articles_df.
scandal_articles_df = all_articles.sort_values(by='Date', ascending=True)
scandal_articles_df.info()

In [None]:

# Load the stock data (Check the excel to know what is what company)
# NOTE(review): a later cell in this notebook reads '../Stock_Final.xlsx' (different letter case).
# On a case-sensitive filesystem only one of the two spellings can exist — confirm the real file name.
stock_data = pd.read_excel("../Stock_FINAL.xlsx", sheet_name = 1)
stock_data



## We plot for Apple in this case

### To plot for other companies, just change the sheet number and company name in the code
#### Check severity score of the most severe articles to check which period to plot 

In [None]:

title_scandal_df = pd.read_csv('title_scandal_severity.csv')
title_scandal_df['Date'] = pd.to_datetime(title_scandal_df['Date'])
title_scandal_df['company_in_title'] = title_scandal_df['company_in_title'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])


text_scandal_df = pd.read_csv('text_scandal_severity.csv')
text_scandal_df['Date'] = pd.to_datetime(text_scandal_df['Date'])
text_scandal_df['company_in_text'] = text_scandal_df['company_in_text'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])


all_articles_df = pd.read_csv('final_data_with_count_and_annotation.csv')
all_articles_df['Date'] = pd.to_datetime(all_articles_df['Date'])
all_articles_df['company_in_title'] = all_articles_df['company_in_title'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
all_articles_df['company_in_text'] = all_articles_df['company_in_text'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])


# Filter for Apple-related articles (the *_enron variable names are kept
# because the rest of this cell refers to them).
# .copy() turns each filtered subset into an independent frame, so the 'Date'
# reassignment below does not write into a slice of the source frame
# (which triggers pandas' SettingWithCopyWarning).
title_enron = title_scandal_df[title_scandal_df['company_in_title'].apply(lambda x: 'Apple' in x)].copy()
# NOTE(review): this series is plotted as 'Scandal Articles based on Text' but is
# drawn from all_articles_df; text_scandal_df is loaded above and never used — confirm.
text_enron = all_articles_df[all_articles_df['company_in_text'].apply(lambda x: 'Apple' in x)].copy()
all_enron = all_articles_df[all_articles_df['company_in_title'].apply(lambda x: 'Apple' in x)].copy()

# Filter dates
start_date = pd.to_datetime('2014-08-01').date()
end_date = pd.to_datetime('2014-11-01').date()

# Compare on plain datetime.date values (time-of-day dropped)
title_enron['Date'] = title_enron['Date'].dt.date
text_enron['Date'] = text_enron['Date'].dt.date
all_enron['Date'] = all_enron['Date'].dt.date

# Keep only rows inside the inclusive [start_date, end_date] window
title_enron = title_enron[(title_enron['Date'] >= start_date) & (title_enron['Date'] <= end_date)]
text_enron = text_enron[(text_enron['Date'] >= start_date) & (text_enron['Date'] <= end_date)]
all_enron = all_enron[(all_enron['Date'] >= start_date) & (all_enron['Date'] <= end_date)]

# Load stock data
stock_data = pd.read_excel('../Stock_Final.xlsx', sheet_name=5)
stock_data['Date'] = pd.to_datetime(stock_data['Date'], errors='coerce').dt.date

# Drop rows with invalid dates or missing stock prices
stock_data.dropna(subset=['Date', 'PX_VOLUME'], inplace=True)


stock_data = stock_data[(stock_data['Date'] >= start_date) & (stock_data['Date'] <= end_date)]

# Count the number of articles per day for each dataset
articles_per_day_title = title_enron.groupby('Date').size().reset_index(name='Article_Count_Title')
articles_per_day_text = text_enron.groupby('Date').size().reset_index(name='Article_Count_Text')
articles_per_day_all = all_enron.groupby('Date').size().reset_index(name='Article_Count_All')

# Merge the trading volume data with the articles count data
merged_df = pd.merge(stock_data, articles_per_day_title, on='Date', how='left').fillna(0)
merged_df = pd.merge(merged_df, articles_per_day_text, on='Date', how='left').fillna(0)
merged_df = pd.merge(merged_df, articles_per_day_all, on='Date', how='left').fillna(0)


def millions(x, pos):
    """Tick formatter: show a raw value as whole millions with an 'M' suffix.

    ``pos`` is the tick position supplied by matplotlib's FuncFormatter and
    is not used.
    """
    in_millions = x * 1e-6
    return f'{in_millions:1.0f}M'

# Create a formatter for the y-axis
formatter = FuncFormatter(millions)

# Plotting
fig, ax1 = plt.subplots(figsize=(14, 8))


ax1.bar(merged_df['Date'], merged_df['PX_VOLUME'], color='#a2cffe', alpha=0.6, label='Trading Volume')
ax1.yaxis.set_major_formatter(formatter)  # Apply the formatter to the y-axis


ax2 = ax1.twinx()
ax2.plot(merged_df['Date'], merged_df['Article_Count_Title'], color='#ffb347', alpha=0.8, label='Scandal Articles based on Title', linewidth=2)


ax2.plot(merged_df['Date'], merged_df['Article_Count_Text'], color='#ff6961', alpha=0.8, label='Scandal Articles based on Text', linewidth=2)


ax2.plot(merged_df['Date'], merged_df['Article_Count_All'], color='#c3a284', alpha=0.8, label='All Articles', linewidth=2)


ax1.set_xlabel('Date')
ax1.set_ylabel('Trading Volume (Millions)', color='#1f77b4')
ax2.set_ylabel('Number of Articles')
ax2.set_ylim(bottom=0)  # Ensure y-axis starts at 0
plt.title('Apple Trading Volume and Number of Articles Per Day')


ax1.legend(loc='upper left')
ax2.legend(loc='upper right')


plt.show()


# Plot Descriptive Statistics

In [None]:
import pandas as pd
import numpy as np
import ast
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Load the data from the provided CSV file to inspect the first few rows and the data structure
file_path = 'scandal_in_title.csv'
all_art = pd.read_csv(file_path)
all_art.info()

In [None]:
# Turn stringified Python lists (e.g. "['a', 'b']") into comma-separated text ("a, b")
def transform_list_string(column):
    """Return ``column`` with every non-null cell parsed and re-joined.

    Each non-null cell is parsed with ``ast.literal_eval`` and its items are
    joined with ', '. Null cells are passed through unchanged.
    """
    def _join_items(cell):
        if pd.notnull(cell):
            return ', '.join(ast.literal_eval(cell))
        return cell

    return column.apply(_join_items)

# Identify columns to transform
columns_to_transform = ['words', 'company_in_title', 'company_in_text', 'scandal_in_title', 'scandal_in_text']

# Apply transformation
for col in columns_to_transform:
    if col in all_art.columns:
        all_art[col] = transform_list_string(all_art[col])


In [None]:
all_art['company_in_title'] = all_art['company_in_title'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])
data_exploded = all_art.explode('company_in_title')

# Count the occurrences of each company (value_counts returns them sorted
# from most to least frequent). The identical statement was previously
# executed twice in a row; once is enough.
company_counts = data_exploded['company_in_title'].value_counts()

# Select the top 15 companies with the most occurrences
top_15_companies = company_counts.head(15)

# Define shades of blue (reversed to make the most cited one the darkest)
blue_shades = [
    '#004040',  
    '#005560', 
    '#006C80', 
    '#0083A0', 
    '#009AC0',  
    '#00B1FF',  
    '#1AB9FF',  
    '#33C0FF',  
    '#4CC8FF', 
    '#66CFFF',  
    '#7FD7FF',  
    '#99DFFF',  
    '#B2E7FF',  
    '#CCEFFF',  
    '#E6F7FF'  
]


colors = blue_shades[:len(top_15_companies)]


plt.figure(figsize=(10, 8))
plt.pie(top_15_companies, labels=top_15_companies.index, autopct='%1.1f%%', colors=colors, startangle=90)
plt.title('Top 15 Companies with Most NYT Articles Related to Scandals')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.


plt.show()

In [None]:
import pandas as pd
import numpy as np
import ast
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import matplotlib.pyplot as plt



# Load the data from the provided CSV file to inspect the first few rows and the data structure
file_path = 'scandal_in_title.csv'
all_art = pd.read_csv(file_path)
all_art.info()

In [None]:

import matplotlib.colors as mcolors  # provides mcolors.to_hex used below; the name was never imported (NameError)

all_art['Year'] = pd.to_datetime(all_art['Date']).dt.year

# Mentions per company and year: {COMPANY_UPPERCASE: {year: count}}
yearly_company_counts = {}

# Iterate through each row of data
for index, row in all_art.iterrows():
    year = row['Year']
    companies_str = row['company_in_title']  # company list stored as a string

    # Non-string cells (e.g. missing values) carry no company names
    if not isinstance(companies_str, str):
        continue

    # Every single-quoted token inside the list string is one company name
    for company in re.findall(r"'([^']*)'", companies_str):
        cleaned_company = company.strip().upper()
        counts_for_company = yearly_company_counts.setdefault(cleaned_company, {})
        counts_for_company[year] = counts_for_company.get(year, 0) + 1


# Legend labels with proper capitalisation
company_display_names = {
    'MICROSOFT': 'Microsoft',
    'GOOGLE': 'Google',
    'ENRON': 'Enron',
    'INTEL': 'Intel',
    'FACEBOOK': 'Facebook',
    'APPLE': 'Apple',
    'CITI': 'Citi',
    'BOFA': 'Bank of America',
    'AMAZON': 'Amazon',
    'BOEING': 'Boeing',
    'HEALTHSOUTH': 'HealthSouth'
}

selected_companies = ['MICROSOFT', 'GOOGLE', 'ENRON', 'INTEL', 'FACEBOOK', 'APPLE', 'CITI', 'BOFA', 'AMAZON', 'BOEING', 'HEALTHSOUTH']

# Rows = companies, columns = years; missing combinations become 0
df = pd.DataFrame.from_dict(yearly_company_counts, orient='index').fillna(0)

# Transpose so rows are years and columns are companies, then order the years
df = df.transpose()
sorted_df = df.sort_index(axis=0)

# Create an empty DataFrame to store the selected columns
selected_df = pd.DataFrame()

for company in selected_companies:
    # Only companies that actually occur in the data get a column
    if company in sorted_df.columns:
        selected_df[company_display_names[company]] = sorted_df[company]


# Share of each company within a year (not plotted in this cell)
normalized_df = selected_df.div(selected_df.sum(axis=1), axis=0)


base_color = '#1f77b4'
num_shades = len(selected_companies)
# One shade of the 'Blues' colormap per selected company, as hex strings
colors = [mcolors.to_hex(plt.cm.Blues(i / (num_shades - 1))) for i in range(num_shades)]
colors.reverse()  # Reverse the order to have the darkest color at the bottom


ax = selected_df.plot(kind='bar', stacked=True, figsize=(12, 7), color=colors)
ax.set_ylabel('Number of Mentions')
ax.set_xlabel('Year')
# NOTE(review): this title says 'NYT' while other charts in this notebook say 'WSJ' — confirm the source.
ax.set_title('Annual Count of NYT Articles on Companies with the Most Scandals')
plt.legend(title='Companies', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig('selected_scandalous_companies_distribution.png')
plt.show()


In [None]:

import matplotlib.colors as mcolors  # provides mcolors.to_hex used below; the name was never imported (NameError)

# Extracting years
all_art['Year'] = pd.to_datetime(all_art['Date']).dt.year

# Mentions per company and year: {COMPANY_UPPERCASE: {year: count}}
yearly_company_counts = {}

# Iterate through each row of data
for index, row in all_art.iterrows():
    year = row['Year']
    companies_str = row['company_in_title']  # company list stored as a string

    # Non-string cells (e.g. missing values) carry no company names
    if not isinstance(companies_str, str):
        continue

    # Every single-quoted token inside the list string is one company name
    for company in re.findall(r"'([^']*)'", companies_str):
        # Normalise: trim surrounding spaces and upper-case
        cleaned_company = company.strip().upper()
        counts_for_company = yearly_company_counts.setdefault(cleaned_company, {})
        counts_for_company[year] = counts_for_company.get(year, 0) + 1

# Map for displaying proper capitalization in the legend
company_display_names = {
    'MICROSOFT': 'Microsoft',
    'GOOGLE': 'Google',
    'ENRON': 'Enron',
    'INTEL': 'Intel',
    'FACEBOOK': 'Facebook',
    'APPLE': 'Apple',
    'CITI': 'Citi',
    'BOFA': 'Bank of America',
    'AMAZON': 'Amazon',
    'BOEING': 'Boeing',
    'HEALTHSOUTH': 'HealthSouth'
}

# Select the specific companies of interest
selected_companies = ['MICROSOFT', 'GOOGLE', 'ENRON', 'INTEL', 'FACEBOOK', 'APPLE', 'CITI', 'BOFA', 'AMAZON', 'BOEING', 'HEALTHSOUTH']

# Rows = companies, columns = years; missing combinations become 0
df = pd.DataFrame.from_dict(yearly_company_counts, orient='index').fillna(0)

# Transpose so rows are years and columns are companies, then order the years
df = df.transpose()
sorted_df = df.sort_index(axis=0)

# Create an empty DataFrame to store the selected columns
selected_df = pd.DataFrame()

# Only companies that actually occur in the data get a column
for company in selected_companies:
    if company in sorted_df.columns:
        selected_df[company_display_names[company]] = sorted_df[company]

# Normalize the data by dividing each company's count by the total count for the selected companies that year
normalized_df = selected_df.div(selected_df.sum(axis=1), axis=0)

# One shade of the 'Blues' colormap per selected company, as hex strings
base_color = '#1f77b4'
num_shades = len(selected_companies)
colors = [mcolors.to_hex(plt.cm.Blues(i / (num_shades - 1))) for i in range(num_shades)]
colors.reverse()  # Reverse the order to have the darkest color at the bottom

# Plot the stacked bar chart with normalized data and gradient of shades
ax = normalized_df.plot(kind='bar', stacked=True, figsize=(12, 7), color=colors)
ax.set_ylabel('Percentage of Mentions')
ax.set_xlabel('Year')
# NOTE(review): this title says 'WSJ' while other charts in this notebook say 'NYT' — confirm the source.
ax.set_title('Annual Distribution of WSJ Articles on Companies with the Most Scandals in Percentage')
plt.legend(title='Companies', bbox_to_anchor=(1.05, 1), loc='upper left')

# Set the y-axis labels to be percentages. The '.0%' format rounds, whereas
# int(x * 100) truncated float products (e.g. 0.29 * 100 -> 28).
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0%}'))

plt.tight_layout()
plt.savefig('selected_scandalous_companies_distribution_percentage.png')
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# Load your dataset (assuming the DataFrame is already loaded as 'all_art')
# Make sure the 'Date' and 'company_in_title' columns exist in the DataFrame
if 'Date' not in all_art.columns or 'company_in_title' not in all_art.columns:
    raise ValueError("The DataFrame must contain 'Date' and 'company_in_title' columns")

# Extracting years
all_art['Year'] = pd.to_datetime(all_art['Date']).dt.year

# Number of articles per year as {year: count}. value_counts replaces the
# row-by-row Python loop; rows whose year is missing are left out of the count.
yearly_article_counts = all_art['Year'].value_counts().to_dict()

# Convert the dictionary to a DataFrame and order it chronologically
yearly_article_counts_df = (
    pd.DataFrame(list(yearly_article_counts.items()), columns=['Year', 'Total Articles'])
    .sort_values(by='Year')
)

# Plot the bar chart
plt.figure(figsize=(14, 8))
plt.bar(yearly_article_counts_df['Year'], yearly_article_counts_df['Total Articles'], color='#1f77b4')
plt.xlabel('Year')
plt.ylabel('Total Number of Articles')
# NOTE(review): this title says 'WSJ' while other charts in this notebook say 'NYT' — confirm the source.
plt.title('Total Number of WSJ Scandal Articles about Companies Published Per Year')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('total_scandal_articles_per_year.png')
plt.show()


## Distribution of top scandal terms

In [None]:
import pandas as pd
import numpy as np
import ast
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re


data = pd.read_csv("scandal_in_title.csv")
data

In [None]:
# Convert list-literal strings into plain comma-separated text - easier to handle
def transform_list_string(column):
    """Parse each non-null cell with ``ast.literal_eval`` and join its items with ', '.

    Null cells are returned as they are.
    """
    def _flatten(value):
        if not pd.notnull(value):
            return value
        items = ast.literal_eval(value)
        return ', '.join(items)

    return column.apply(_flatten)

# Identify columns to transform
columns_to_transform = ['words', 'company_in_title', 'company_in_text', 'scandal_in_title', 'scandal_in_text']

# Apply transformation
for col in columns_to_transform:
    if col in data.columns:
        data[col] = transform_list_string(data[col])


In [None]:
import nltk
nltk.download('punkt')


In [None]:
# Publication year of every article
data['Year'] = pd.to_datetime(data['Date']).dt.year

# Missing entries become '' and then an empty list; every other entry is split on '__'.
# NOTE(review): an earlier cell joins the list items of this column with ', ' —
# confirm that '__' is really the separator between terms at this point.
data['scandal_in_title'] = (
    data['scandal_in_title']
    .fillna('')
    .apply(lambda terms: terms.split('__') if terms != '' else [])
)

# One row per (article, scandal term)
expanded_data = data.explode('scandal_in_title')

# Year x term table of occurrence counts (0 where a term does not occur in a year)
count_per_year = expanded_data.groupby(['Year', 'scandal_in_title']).size().unstack(fill_value=0)

# The ten terms with the highest total count across all years
top_scandals = count_per_year.sum(axis=0).nlargest(10).index

# Same ten terms ordered from most to least frequent
sorted_scandals = count_per_year[top_scandals].sum().sort_values(ascending=False).index

# Define a base color
base_color = '#1f77b4'

# Figure set-up
plt.figure(figsize=(20, 8))
bar_width = 0.8
years = count_per_year.index
num_scandals = len(sorted_scandals)

# Running height of the stack for each year
base = np.zeros(len(years))

# Stack the terms, most frequent first (bottom) and in the darkest shade
for position, term in enumerate(sorted_scandals):
    shade = plt.cm.Blues(1 - (position / num_scandals))
    term_counts = count_per_year[term]

    plt.bar(years, term_counts, width=bar_width, label=term, color=shade, bottom=base)

    # The next term is drawn on top of this one
    base += term_counts

plt.title('Distribution of Top 10 Scandal Terms Over Time')
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.legend(title='Scandal Terms', loc='upper left', bbox_to_anchor=(1.05, 1))
plt.grid(True, axis='y')
plt.tight_layout()

# Show the plot
plt.show()

## Plot the Frequency Plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from collections import Counter

# Take our dictionary
keywords = [
    "corrupt.*", "deceit.*", "decept.*", "deceiv.*", "betray.*", "shame.*", 
    "scandal.*", "dishonest.*", "misconduct.*", "fraud", "illegal.*", 
    "unethic.*", "violat.*", "falsify.*", "breach.*", "leak.*", "pollut.*", 
    "insecur.*", "irregular.*", "mismanag.*", "inappropriat.*", "unlaw.*", 
    "transgress.*", "noncomplian.*", "non-complian.*", "ghost.*", "malfeas.*", 
    "exploitat.*", "discriminat.*", "harass.*", "misrepresent.*", "embezzle.*", 
    "improper.*", "espion.*", "collus.*", "misus.*", "rigg.*", "kickback.*", 
    "retaliat.*", "moral lapse", "insider trading", "insider dealing", 
    "ponzi scheme", "arrest.*", "product recall", "privacy breach", 
    "privacy violation.*", "data leak", "intellectual property dispute", 
    "malpracti.*", "destruct.*", "unsustain.*", "ESG scandal", "fraudulent", 
    "deceptiv.*", "scamm.*", "briber.*", "bribe.*", "extort.*", 
    "misappropriat.*", "sabotag.*", "deforest.*", "habitat destruct.*", 
    "climate change deni.*", "tax evasion", "money launder.*", 
    "accounting scandal", "whistleblow.*", "sexual harass.*", 
    "workplace harass.*", "toxic culture", "data breach", "ransomware", 
    "drug recall", "clinical trial fraud", "off-label marketing", "antitrust", 
    "cartel", "monopoly", "litigat.*", "regulatory breach", "cover-up", 
    "settlement", "lawsuit", "penalt.*"
]

# Compile the regular expressions once, case-insensitively
keyword_patterns = [re.compile(keyword, re.IGNORECASE) for keyword in keywords]

# Number of titles in which each keyword pattern matches at least once
keyword_counter = Counter()

# Missing titles are read from CSV as float NaN, which pattern.search() rejects
# with a TypeError; drop them and make sure every remaining title is a string.
# NOTE(review): patterns are searched without word boundaries, so e.g. "rigg.*"
# also matches inside "trigger" — confirm this is the intended matching rule.
for title in data['Title'].dropna().astype(str):
    for pattern in keyword_patterns:
        if pattern.search(title):
            keyword_counter[pattern.pattern] += 1

# Convert the counter to a DataFrame for plotting, most frequent keyword first
scandal_df = pd.DataFrame.from_dict(keyword_counter, orient='index', columns=['Count']).reset_index()
scandal_df = scandal_df.rename(columns={'index': 'Scandal'})
scandal_df = scandal_df.sort_values(by='Count', ascending=False)

# Plotting the bar chart
plt.figure(figsize=(20, 10))  # Increased figure size for better readability
plt.bar(scandal_df['Scandal'], scandal_df['Count'], color='#a2cffe', alpha=0.8)
plt.xlabel('Scandal Words', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.title('Most Used Scandal Words in Titles', fontsize=16)
plt.xticks(rotation=90, ha='center', fontsize=12)  # Changed rotation and increased font size
plt.tight_layout()

# Display plot
plt.show()


## Plot the Occurrence Plot

In [None]:
import pandas as pd
import numpy as np
import ast
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import seaborn as sns
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer



data = pd.read_csv("scandal_in_title.csv")
data

In [None]:
# Rewrite list-literal strings as comma-separated words, which are easier to handle
def transform_list_string(column):
    """Map every non-null cell "['a', 'b']" to "a, b"; leave null cells untouched."""
    def _as_csv(entry):
        return ', '.join(ast.literal_eval(entry)) if pd.notnull(entry) else entry

    converted = column.apply(_as_csv)
    return converted


columns_to_transform = ['words', 'company_in_title', 'company_in_text', 'scandal_in_title', 'scandal_in_text']


for col in columns_to_transform:
    if col in data.columns:
        data[col] = transform_list_string(data[col])


In [None]:


titles = data[['Title', 'company_in_title', 'scandal_in_title']]
# .copy(): the column assignment below must not write into a slice of `titles`
scandal_titles = titles[titles['scandal_in_title'].notna()].copy()

# Parse and tokenize the scandal-related words
def tokenize_scandal_words(scandal_str):
    """Split a scandal-term string on '__', commas and whitespace into single tokens."""
    return scandal_str.replace('__', ' ').replace(',', ' ').split()

scandal_titles['scandal_in_title'] = scandal_titles['scandal_in_title'].apply(lambda x: tokenize_scandal_words(x) if isinstance(x, str) else [])

all_scandals = [scandal for sublist in scandal_titles['scandal_in_title'] for scandal in sublist]
# sorted() gives the vocabulary a stable order; a bare set's order can differ
# between kernel sessions, which changes how ties are ordered further down.
unique_scandals = sorted(set(all_scandals))

# Binary bag-of-words over the titles, restricted to the scandal vocabulary
vectorizer = CountVectorizer(vocabulary=unique_scandals, lowercase=True, binary=True)
X = vectorizer.fit_transform(scandal_titles['Title'])


# company -> Counter of scandal words found in that company's titles
company_words = defaultdict(Counter)

# Rows of X are positional (0..n-1). The frame's index labels are not, because
# rows were filtered out above, so X must be addressed by position, not label.
for position, (_, row) in enumerate(scandal_titles.iterrows()):
    companies = row['company_in_title']
    if not isinstance(companies, str):
        continue  # no company recorded for this title
    title_vector = X[position].toarray()[0]
    word_hits = dict(zip(unique_scandals, title_vector))
    for company in companies.split(', '):
        company_words[company].update(word_hits)


# Number of scandal titles per company
company_counts = Counter()
for companies in scandal_titles['company_in_title']:
    if not isinstance(companies, str):
        continue
    for company in companies.split(', '):
        company_counts[company] += 1

# NOTE(review): 11 companies / 11 words are selected while the chart title says "Top 10" — confirm.
top_companies = [company for company, count in company_counts.most_common(11)]


plot_data = []
for company in top_companies:
    top_words = company_words[company].most_common(11)  # most frequent scandal words for this company
    for word, freq in top_words:
        plot_data.append([company, word, freq])

df_plot = pd.DataFrame(plot_data, columns=['Company', 'Word', 'Frequency'])


# Express each word's frequency as a share of its company's total
df_plot['Frequency'] = df_plot['Frequency'] / df_plot.groupby('Company')['Frequency'].transform('sum')

plt.figure(figsize=(14, 10))
bubble_plot = sns.scatterplot(data=df_plot, x='Company', y='Word', size='Frequency', hue='Company', legend=False, sizes=(20, 2000), alpha=0.6)

plt.title('Top 10 Scandal Words in Titles Associated with Top 10 Companies')
plt.xlabel('Company')
plt.ylabel('Word')
plt.xticks(rotation=90)
plt.grid(True)
plt.tight_layout()

plt.show()


In [None]:
# The file is written earlier in this notebook as "scandal_in_title.csv" (lower-case "s");
# the capitalised spelling fails with FileNotFoundError on case-sensitive filesystems.
scandals = pd.read_csv("scandal_in_title.csv", index_col=0)
scandals

## Machine Learning Verification

In [None]:
annoted = pd.read_csv("ScandalsRevised.csv", index_col=0)
annoted

In [None]:
merged_df = pd.merge(scandals, annoted, left_on='Title', right_on='text', how='inner')
merged_df

In [None]:
merged_df = merged_df.drop(columns=['Unnamed: 2','Date_y','Comments'])

In [None]:
merged_df

In [None]:
merged_df.to_csv("ScandalsRevised1.csv")

In [None]:
import pandas as pd
import numpy as np
import nltk
import gensim
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE
from gensim.models import KeyedVectors
import seaborn as sns
import matplotlib.pyplot as plt

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')


In [None]:

# Load pre-trained Word2Vec model (Google's pre-trained model)
word2vec_path = '../GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

 

In [None]:
# Transformer that cleans text and represents each document by its mean Word2Vec vector
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Turn a pandas Series of documents into a 2-D array of averaged word vectors.

    Parameters
    ----------
    word2vec : mapping-like word-vector model
        Must support ``word in word2vec`` and ``word2vec[word]``.
    vector_size : int, default 300
        Length of the zero vector used for documents without any known word.
    """

    def __init__(self, word2vec, vector_size=300):
        self.word2vec = word2vec
        self.vector_size = vector_size
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return one averaged vector per document in ``X`` (a pandas Series of str)."""
        cleaned_docs = X.apply(self._preprocess)
        doc_vectors = []
        for tokens in cleaned_docs.str.split():
            known = [self.word2vec[token] for token in tokens if token in self.word2vec]
            if not known:
                # No token is in the model's vocabulary: fall back to a zero vector
                known = [np.zeros(self.vector_size)]
            doc_vectors.append(np.mean(known, axis=0))
        return np.array(doc_vectors)

    def _preprocess(self, doc):
        """Keep letters only, lower-case, drop stop words and lemmatize; return a space-joined string."""
        letters_only = re.sub('[^a-zA-Z]', ' ', doc)
        kept = []
        for word in letters_only.lower().split():
            if word not in self.stop_words:
                kept.append(self.lemmatizer.lemmatize(word))
        return ' '.join(kept)

# Load the labeled dataset
labeled_df = pd.read_csv('ScandalsRevised1.csv')

# Keep only the columns used for modelling. .copy() makes the subset an
# independent frame, so the 'label' assignment below is not a write into a
# slice of the loaded frame (SettingWithCopyWarning).
labeled_df = labeled_df[["GOID", "text", "label"]].copy()
# Rows without a label are treated as 'N'
labeled_df['label'] = labeled_df['label'].fillna('N')

X = labeled_df['text']
y = labeled_df['label']

# Ensure all values in X are strings
X = X.astype(str)

# Split the data into training and testing sets (31% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.31, random_state=42)

# Transform the text data to Word2Vec vectors
word2vec_transformer = Word2VecTransformer(word2vec)
X_train_vectors = word2vec_transformer.transform(X_train)
X_test_vectors = word2vec_transformer.transform(X_test)

# Apply SMOTE to the training vectors only, to handle class imbalance;
# the test set is left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vectors, y_train)

# Define pipeline for the Logistic Regression classifier
PipeLR = Pipeline([
    ("clf", LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# Train the model on the resampled training data
PipeLR.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
PredictLR = PipeLR.predict(X_test_vectors)


In [None]:

# Print accuracy scores and classification reports
print("LogisticRegression Accuracy:", accuracy_score(y_test, PredictLR))
print("Classification Report for LogisticRegression:")
print(classification_report(y_test, PredictLR))

print("Confusion Matrix for LogisticRegression:")
cm_lr = confusion_matrix(y_test, PredictLR)
print(cm_lr)


In [None]:

# Plot the confusion matrix for LogisticRegression
plt.figure(figsize=(10,7))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', xticklabels=['N', 'Y'], yticklabels=['N', 'Y'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix for LogisticRegression')
plt.show()
