# UFCFVQ-15-M Programming for Data Science (Autumn 2023)

## Student Id: 

# Programming Task 2

### Requirement FR6 - Read CSV data from two files and merge it into a single Data Frame 

In [None]:
# Functional Requirement #6
# Using the Pandas library to read and merge both CSV files into one based on 'Entity' and 'Year'.
import pandas as pd

def merge_csv_files(file_path1, file_path2):
    """Load two CSV files and join them on their 'Entity' and 'Year' columns.

    Args:
        file_path1: Location of the CSV that forms the left side of the join.
        file_path2: Location of the CSV that forms the right side of the join.

    Returns:
        One DataFrame built with pandas' default join type, keyed on
        'Entity' and 'Year'.
    """
    # Load the inputs in order: first file, then second file.
    left_frame, right_frame = [
        pd.read_csv(path) for path in (file_path1, file_path2)
    ]

    # Columns that must match for rows from the two files to be combined.
    join_keys = ['Entity', 'Year']
    return left_frame.merge(right_frame, on=join_keys)

# Exercise FR6: merge the two task CSV files and preview the first five rows.
merged_data = merge_csv_files(file_path1='task2a.csv', file_path2='task2b.csv')
print(merged_data.head(5))


### Requirement FR7 - Explore the dataset to identify an "interesting" pattern or trend

In [None]:
# Functional Requirement #7
# Using both Matplotlib and Seaborn for plotting different types of visualizations
# (line plot, heatmap, and boxplot).

import matplotlib.pyplot as plt
import seaborn as sns

# Visualization 1: one line per entity showing Schizophrenia (%) by year
plt.figure(figsize=(10, 6))
line_axes = sns.lineplot(data=merged_data, x='Year', y='Schizophrenia (%)', hue='Entity')
line_axes.set_title('Line Plot: Schizophrenia (%) Over the Years')
plt.show()

# Visualization 2: heatmap of pairwise correlations
# Only float64/int64 columns take part in the correlation matrix.
numeric_columns = merged_data.select_dtypes(include=['float64', 'int64']).columns
numeric_only_data = merged_data[numeric_columns]
correlation_matrix = numeric_only_data.corr()

# Annotate each cell with its coefficient; diverging palette for sign.
plt.figure(figsize=(10, 6))
heatmap_axes = sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
heatmap_axes.set_title('Correlation Heatmap')
plt.show()

# Visualization 3: per-entity distribution of Drug use disorders (%)
plt.figure(figsize=(10, 6))
box_axes = sns.boxplot(data=merged_data, x='Entity', y='Drug use disorders (%)')
box_axes.set_title('Boxplot: Distribution of Drug use disorders (%)')
# Tilt the entity names so neighbouring labels do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()


### Requirement FR8 - Detect and remove any outliers in the data used for your "interesting" pattern or trend

In [None]:
from scipy.stats import zscore

# Data Preprocessing
# Use z-scores to identify and remove outliers in the dataset.
# A z-score measures how many standard deviations a value lies from the
# column mean, so this approach assumes the data is approximately normally
# distributed; plotting a histogram of the column is a quick way to check
# that assumption. Rows whose z-score is above 3 or below -3 are treated as
# outliers and filtered out of the data.

def remove_outliers(data_frame, column_name, threshold=3):
    """Return the rows of ``data_frame`` whose value in ``column_name`` is not an outlier.

    A row is an outlier when the absolute z-score of its value in
    ``column_name`` is strictly greater than ``threshold``.

    Args:
        data_frame: DataFrame to filter; it is not modified.
        column_name: Name of the numeric column used to compute z-scores.
        threshold: Absolute z-score above which a row is dropped.
            Defaults to 3, the conventional "three standard deviations" rule.

    Returns:
        A DataFrame holding the non-outlier rows. Rows with a missing value in
        ``column_name`` are kept, because a missing value has no z-score.
    """
    column_values = data_frame[column_name].to_numpy()

    # nan_policy='omit' computes the mean and standard deviation from the
    # non-missing values only. With the default ('propagate') a single NaN
    # makes every z-score NaN, so no outlier would ever be detected.
    z_scores = zscore(column_values, nan_policy='omit')

    # Comparisons against NaN are False, so rows with missing values are
    # never flagged as outliers.
    outliers = abs(z_scores) > threshold

    return data_frame[~outliers]

# Exercise the preprocessing step:
# drop the rows whose 'Schizophrenia (%)' value is an outlier.
merged_data_no_outliers = remove_outliers(
    data_frame=merged_data,
    column_name='Schizophrenia (%)',
)


### Requirement FR9 - Define a hypothesis to test your “interesting” pattern or trend

In [None]:
# Test Hypothesis (FR9)
# Hypotheses are used to evaluate the validity of results in a study. 
# An assumption is made about the potential truth, and statistical analysis 
# is performed to determine the accuracy of the results.

# In this case, we are testing the hypothesis that there is a significant 
# difference in 'Schizophrenia (%)' between different entities.

# Hypothesis:
# Null Hypothesis (H0): The mean 'Schizophrenia (%)' is the same for all entities (no significant difference).
# Alternative Hypothesis (H1): The mean 'Schizophrenia (%)' differs significantly for at least one entity.


### Requirement FR10 - Test your hypothesis with statistical significance level of 0.05

In [None]:
# Test Hypothesis (FR10)
# Let's test the previously formulated hypothesis using an ANOVA test.

from scipy.stats import f_oneway

# Build one sample of 'Schizophrenia (%)' values per entity, ignoring missing
# values. The grouping key is 'Entity': it is the column the data was merged
# on and the one the hypothesis is stated about. The outlier-free frame is
# used so the test runs on the data cleaned in the preprocessing step.
category_groups = [
    values.dropna()
    for _, values in merged_data_no_outliers.groupby('Entity')['Schizophrenia (%)']
]

# An entity whose values are all missing contributes an empty sample,
# which would make the ANOVA result undefined, so such groups are skipped.
category_groups = [values for values in category_groups if not values.empty]

# f_oneway compares group means and therefore needs at least two groups.
if len(category_groups) < 2:
    raise ValueError('ANOVA requires at least two non-empty groups of Schizophrenia (%) values.')

# Perform ANOVA test
# f_oneway is used for ANOVA test to check if means of groups are equal
f_statistic, p_value = f_oneway(*category_groups)

print(f'ANOVA F-statistic: {f_statistic}')
print(f'ANOVA P-value: {p_value}')

# Interpret the result at the 0.05 significance level
alpha = 0.05
if p_value < alpha:
    print('Reject the null hypothesis. There is a significant difference in Schizophrenia percentages between categories.')
else:
    print('Fail to reject the null hypothesis. No significant difference in Schizophrenia percentages between categories.')
