# Correlation Analysis
*Master "Applied Data Science" @ Nordakademie*
*Module: Text Analytics*

### General pre-work

In [38]:
# Import necessary libraries

# Overall needed
import pandas as pd # to work with data frames
from scipy.stats import chi2_contingency

# Needed for visualisations and smaller functions such as time tracking or saving files
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import os # to define a dedicated output folder

In [39]:
output_folder = 'data/analysis' # dedicated folder for the merged analysis files written by this notebook

# Create the folder if it is missing. exist_ok=True replaces the separate
# os.path.exists() check and avoids the check-then-create race between the two calls.
os.makedirs(output_folder, exist_ok=True)

# Load the two inputs that are combined in the pre-processing step below

csv_file_path_normal_3 = 'data/transfer/cleaned_articles_normalverteilt_3.csv' # input csv file with 3 categories, normal distribution

# Load the data set
# Read explicitly as UTF-8: the csv file still contains incorrectly encoded special characters
df_3categories_normal = pd.read_csv(csv_file_path_normal_3, encoding='utf-8', quoting=csv.QUOTE_ALL)

csv_file_path_combined_sentiments = 'data/sentiment_results/sentiment_results_combined.csv' # input csv file with the sentiment results

# Load the data set
# Read explicitly as UTF-8: the csv file still contains incorrectly encoded special characters
df_combined_sentiments = pd.read_csv(csv_file_path_combined_sentiments, encoding='utf-8', quoting=csv.QUOTE_ALL)

## Pre-Processing

In [40]:
# Append the normal-distribution stock label ("Stock_ValueStd") to the sentiment results.
# There is no key column in this join: rows are matched on the data frame index only,
# so both files must describe the same rows in the same order.
# An inner join on the index would silently drop rows if the lengths differed, so fail loudly instead.
if len(df_combined_sentiments) != len(df_3categories_normal):
    raise ValueError(
        f"Cannot merge by row position: the sentiment results have {len(df_combined_sentiments)} rows "
        f"but the normal-distribution file has {len(df_3categories_normal)} rows."
    )

# validate='one_to_one' makes pandas raise if an index label is duplicated on either side,
# which would otherwise multiply rows in the result.
df_merged_3_categories = df_combined_sentiments.merge(
    df_3categories_normal[['Stock_ValueStd']],
    left_index=True,
    right_index=True,
    validate='one_to_one',
)

# Quick visual check of the merged result
print(df_merged_3_categories.head(3))

   Row_Number                   Unternehmen Newstyp   Quelle Nearest_Date  \
0           1  Porsche Automobil Holding SE    News  onvista   2021-06-01   
1           2                    Beiersdorf    News  onvista   2021-06-02   
2           3          Heidelberg Materials    News  onvista   2021-06-02   

                                        Cleaned_Text Stock_Value  \
0  neu Rumor Porschebörsengang Sixt Berenberg stu...     neutral   
1  Beiersdorf Aktie Kaufempfehlung beflügeln Bere...    positive   
2  Heidelbergcement klimaneutral Zementwerk Weg B...    positive   

   TextBlob_Sentiment_Score TextBlob_Evenly_Separated_Label  \
0                   -0.0500                        Negative   
1                    0.2000                        Positive   
2                   -0.0875                        Negative   

  TextBlob_Normal_Distribution_Label  NLTK_Sentiment  \
0                            Neutral         -0.3612   
1                           Positive          0.0000 

In [41]:
# Rename and reposition columns

# 1. + 2. Give both stock label columns a name that states the binning scheme:
#    "Stock_Value"    -> "Stock_Value_Evenly_Separated"
#    "Stock_ValueStd" -> "Stock_Value_Normal_Distribution"
df_merged_3_categories = df_merged_3_categories.rename(
    columns={
        'Stock_Value': 'Stock_Value_Evenly_Separated',
        'Stock_ValueStd': 'Stock_Value_Normal_Distribution',
    }
)

# 3. Move "Stock_Value_Normal_Distribution" to the 8th column (list index 7):
#    take it out of the column list, then put it back at the target position
cols = list(df_merged_3_categories.columns)
cols.remove('Stock_Value_Normal_Distribution')
cols.insert(7, 'Stock_Value_Normal_Distribution')
df_merged_3_categories = df_merged_3_categories[cols]

In [42]:
# Persist the merged table in the analysis output folder, once per file format

# Both files share the same base name and differ only in their extension
excel_output_file_analysis_3 = os.path.join(output_folder, 'analysis_3_categories.xlsx')
csv_output_file_analysis_3 = os.path.join(output_folder, 'analysis_3_categories.csv')

# Excel workbook without the index column
df_merged_3_categories.to_excel(excel_output_file_analysis_3, index=False)

# UTF-8 csv without the index column, every field quoted
df_merged_3_categories.to_csv(
    csv_output_file_analysis_3,
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
)

# Print final info
print(f"Results saved as {csv_output_file_analysis_3} and {excel_output_file_analysis_3}.")

Results saved as data/analysis\analysis_3_categories.csv and data/analysis\analysis_3_categories.xlsx.


## Correlation Analysis

### 3 Categories

#### TextBlob Even Separation

In [43]:
def chi_squared_association(df, row_col, col_col, alpha=0.05):
    """Run a chi-squared test of independence on two categorical columns and print the result.

    The column names are passed in once and reused for both the test and the
    printed interpretation, so the message always names the columns that were tested.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame containing both columns.
    row_col : str
        Column whose categories form the rows of the contingency table.
    col_col : str
        Column whose categories form the columns of the contingency table.
    alpha : float, default 0.05
        Significance level; the association is reported as significant when p <= alpha.

    Returns
    -------
    tuple
        (contingency_table, chi2, p): the cross-tabulation, the chi-squared
        statistic and the p-value returned by scipy's chi2_contingency.
    """
    # Create a contingency table
    table = pd.crosstab(df[row_col], df[col_col])

    # Perform the chi-squared test (degrees of freedom and expected counts are not used here)
    chi2_stat, p_value, _, _ = chi2_contingency(table)

    # Output the chi-squared statistic and p-value
    print(f"Chi-Squared Statistic: {chi2_stat}")
    print(f"P-Value: {p_value}")

    # Interpret the results
    if p_value <= alpha:
        print(f"There is a significant association between the two categorical variables {row_col} and {col_col}.")
    else:
        print(f"There is no significant association between the two categorical variables {row_col} and {col_col}.")

    return table, chi2_stat, p_value


alpha = 0.05  # Significance level

# Stock label vs. TextBlob label, both from the evenly separated categorisation
contingency_table, chi2, p = chi_squared_association(
    df_merged_3_categories,
    'Stock_Value_Evenly_Separated',
    'TextBlob_Evenly_Separated_Label',
    alpha=alpha,
)

Chi-Squared Statistic: 74.37817173044427
P-Value: 2.6972703877652494e-15
There is a significant association between the two categorical variables Stock_Value_Evenly_Separated and TextBlob_Evenly_Separated_Label.
