<a href="https://colab.research.google.com/github/Joh-Ishimwe/Data-Preprocessing/blob/master/Part_3_Data_Consistency_and_Quality_Checks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Data Integrity Checks

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
# Suppress all warnings so library chatter stays out of the cell outputs
warnings.filterwarnings(action='ignore')

# Read the merged dataset produced in Part 2
merged_csv_path = 'final_customer_data_7.csv'
df_final = pd.read_csv(merged_csv_path)


In [None]:
# 1. Duplicate rows: count fully identical rows and drop the repeats if any exist
duplicates = df_final.duplicated().sum()
print(f"Number of duplicate entries: {duplicates}")
if duplicates > 0:
    df_final = df_final.drop_duplicates()
    print("Duplicates removed.")

# 2. Categorical columns: list the distinct values of each one so that
#    unexpected or inconsistently spelled categories can be spotted by eye
categorical_columns = ['product_category', 'social_media_platform', 'review_sentiment']
for category_col in categorical_columns:
    print(f"Unique values in {category_col}: {df_final[category_col].unique()}")

# 3. Transactions vs. social profiles: a row that carries a customer_id_new
#    but has no social_media_platform is reported as a mismatch
has_new_id = df_final['customer_id_new'].notna()
lacks_platform = df_final['social_media_platform'].isna()
mismatched = df_final[has_new_id & lacks_platform]
print(f"Transactions with customer_id_new but no social_media_platform: {len(mismatched)}")
if len(mismatched):
    id_and_platform_cols = ['customer_id_legacy', 'customer_id_new', 'social_media_platform']
    print("Sample mismatched rows:")
    print(mismatched[id_and_platform_cols].head())

# 2. Statistical Summarization

In [None]:
# describe() report restricted to the numeric columns of the dataset
numerical_columns = df_final.select_dtypes(include=[np.number]).columns
numeric_summary = df_final[numerical_columns].describe()
print("\nStatistical Summary of Numerical Columns:")
print(numeric_summary)

# Side-by-side histograms of the purchase amount before and after augmentation
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
hist_panels = [
    ('purchase_amount', 'Distribution of Original Purchase Amount'),
    ('purchase_amount_noisy', 'Distribution of Noisy Purchase Amount'),
]
for panel_ax, (amount_col, panel_title) in zip(axes, hist_panels):
    sns.histplot(df_final[amount_col], bins=30, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)
fig.tight_layout()
plt.show()

# 3. Feature Selection for Machine Learning

In [None]:
# Correlation heatmap: pairwise correlations between all numeric columns
plt.figure(figsize=(12, 8))
correlation_matrix = df_final[numerical_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Numerical Features')
plt.show()

# Candidate features = everything except the target itself, identifier/date
# columns, and the raw categorical columns (only those actually present).
features_to_drop = ['target', 'purchase_date', 'customer_id_legacy', 'customer_id_new',
                    'transaction_id'] + categorical_columns
features_to_drop = [col for col in features_to_drop if col in df_final.columns]
features = df_final.drop(columns=features_to_drop)

# f_classif treats y as class labels, so a missing target must not be imputed
# with the column mean: that value is generally not a valid label and would
# show up as an extra, artificial class. Rows without a target are left out of
# the scoring instead (df_final itself is not modified).
# NOTE(review): this assumes `target` holds discrete class labels; if it is
# continuous, f_regression would be the matching score function - confirm.
has_target = df_final['target'].notna()
y = df_final.loc[has_target, 'target']

# The F-test needs numeric input; any remaining text/datetime column would make
# the fit fail, so only numeric and boolean columns are scored.
X = features.loc[has_target].select_dtypes(include=[np.number, 'bool']).fillna(0)

# Select the top 10 features, or all of them when fewer than 10 are available
# (SelectKBest rejects an integer k larger than the number of features).
n_selected = min(10, X.shape[1])
selector = SelectKBest(score_func=f_classif, k=n_selected)
selector.fit(X, y)
support_mask = selector.get_support()
# Kept in original column order because the export step reuses this list
selected_features = X.columns[support_mask].tolist()

# Print the selected features ranked by F-score, best first (NaN scores last)
ranked_scores = pd.Series(selector.scores_[support_mask],
                          index=selected_features).sort_values(ascending=False)
print(f"\nTop {len(selected_features)} Most Important Features:")
for i, (feature, score) in enumerate(ranked_scores.items(), 1):
    print(f"{i}. {feature}: {score:.2f}")

In [None]:
# Task 4: Final Data Export
# Desired column order: identifiers and date, then the categorical columns,
# then the selected features, and the target last.
id_and_date_columns = ['customer_id_legacy', 'customer_id_new', 'transaction_id', 'purchase_date']
requested_columns = [*id_and_date_columns, *categorical_columns, *selected_features, 'target']

# Keep only the requested columns that actually exist in the dataset
final_columns = []
for export_col in requested_columns:
    if export_col in df_final.columns:
        final_columns.append(export_col)
final_dataset = df_final[final_columns]

# Write the export without the DataFrame index
final_dataset.to_csv('final_dataset_ready_7.csv', index=False)
print("\nFinal dataset saved as 'final_dataset_ready_7.csv'")