# MSA 2024 Phase 2 - Part 1

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
%matplotlib inline

## 1. Find all variables and understand them

In [2]:
# 1. Find all variables and understand them
# Load the UTF-8 encoded CSV into a DataFrame.
# NOTE(review): the path below is an absolute, machine-specific location;
# the notebook only runs where this file exists.
output_file_path = 'D:/Download/store_sales_utf8.csv'
data = pd.read_csv(filepath_or_buffer=output_file_path, encoding='utf-8')

# First ten rows of the raw data
print(data.head(n=10))

# Summary statistics produced by DataFrame.describe()
print(data.describe())

# dtype of every column
print(data.dtypes)

## 2. Clean data

In [3]:
# 2. Clean data
# Pipeline: drop incomplete rows -> drop IQR outliers -> standardise the
# numeric columns -> re-attach the non-numeric columns for the surviving rows.

# Handling missing values: remove every row that contains at least one NaN.
data_cleaned = data.dropna()

numeric_cols = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
data_numeric = data_cleaned[numeric_cols]

# Handling outliers: a row is dropped when ANY of its numeric values lies
# more than 1.5 * IQR below the first or above the third quartile.
Q1 = data_numeric.quantile(0.25)
Q3 = data_numeric.quantile(0.75)
IQR = Q3 - Q1
outlier_mask = (data_numeric < (Q1 - 1.5 * IQR)) | (data_numeric > (Q3 + 1.5 * IQR))
data_numeric = data_numeric.loc[~outlier_mask.any(axis=1)]

# Normalizing data: build a NEW frame from the scaled array instead of
# assigning into the filtered slice. This avoids SettingWithCopyWarning and
# avoids writing float values into int64 columns.
# StandardScaler comes from sklearn.preprocessing (see the import cell).
scaler = StandardScaler()
data_numeric = pd.DataFrame(
    scaler.fit_transform(data_numeric),
    index=data_numeric.index,
    columns=numeric_cols,
)

# Re-attach the non-numeric columns, restricted to the rows that survived the
# outlier filter. A plain axis=1 concat aligns on the union of both indexes
# and would bring the removed outlier rows back with NaN numeric values.
non_numeric = data_cleaned.drop(columns=numeric_cols).loc[data_numeric.index]
data_cleaned = pd.concat([data_numeric, non_numeric], axis=1)

# Display the cleaned data
print(data_cleaned.head(10))

## 3. Visualise data

In [4]:
# 3. Visualise data
# One figure per column: bar charts for the object-dtype columns first, then
# a histogram for every numeric column, then a box plot for every numeric
# column.

# Bar chart of value frequencies for each categorical (object) column
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
for col, values in data_cleaned[categorical_columns].items():
    frequencies = values.value_counts()
    frequencies.plot(kind='bar')
    plt.title(f'{col} Bar Chart')
    plt.show()

# 30-bin histogram for each numerical column
for col, values in data_cleaned[numeric_cols].items():
    values.hist(bins=30)
    plt.title(f'{col} Histogram')
    plt.show()

# Horizontal box plot for each numerical column
for col, values in data_cleaned[numeric_cols].items():
    sns.boxplot(x=values)
    plt.title(f'{col} Box Plot')
    plt.show()

## 4. Identify correlated variables

In [5]:
# 4. Identify correlated variables
# Compute the pairwise correlation matrix of the numeric columns
# (DataFrame.corr() with its default settings).
correlation_matrix = data_numeric.corr()

# Visualize correlation matrix as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Select most relevant features: every column whose absolute correlation with
# the target exceeds the threshold. The target column itself is included in
# the selection because its correlation with itself is 1.
target_column = 'your_target_column'  # Replace with your actual target column
threshold = 0.5
if target_column in correlation_matrix.columns:
    target_correlation = correlation_matrix[target_column].abs()
    relevant_features = target_correlation[target_correlation > threshold].index
    print(relevant_features)
    data_selected = data_numeric[relevant_features]

    # Display the selected data
    print(data_selected.head(10))
else:
    # Make the skipped selection visible instead of silently doing nothing,
    # e.g. when the placeholder above has not been replaced yet.
    print(
        f"Target column {target_column!r} is not among the numeric columns; "
        f"no features were selected. Available columns: "
        f"{list(correlation_matrix.columns)}"
    )

## 5. Summary

In this notebook, we performed the following steps for data analysis and preprocessing:
1. Loaded the dataset and displayed the first ten instances.
2. Provided key statistical metrics including mean and standard deviation.
3. Visualized categorical columns using bar charts and numerical columns using histograms and box plots.
4. Handled missing values by removing instances with missing data.
5. Removed outliers using the interquartile range (IQR) method.
6. Normalized the numerical columns to standardize the feature values.
7. Computed and visualized the correlation matrix to identify relevant features.
8. Selected the most relevant features: those whose absolute correlation with the target column exceeds a threshold of 0.5.

These steps helped us clean the data and prepare it for machine learning model training and evaluation. We found interesting trends and patterns during the analysis, such as the strong correlation between certain numerical columns and the target variable. These insights will guide our feature selection and model development process.
