<a href="https://colab.research.google.com/github/Joh-Ishimwe/Data-Preprocessing/blob/master/Formative_2_Data_Preprocessing_Assignment_for_Machine_Learning_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Formative 2: Data Preprocessing Assignment for Machine Learning Pipeline
**Team Members**:  

*   Liliane Kayitesi
*   Ines Ikirezi
*   Josiane Ishimwe


**Group Number**: 7  

---
## Part 1: Data Augmentation on CSV Files


# Load the dataset

In [1]:
import numpy as np
import pandas as pd

# Load the dataset
path = '/content/customer_transactions.csv'
df = pd.read_csv(path)

#Step 2: Data Cleaning & Handling Missing Values

In [2]:
# Check for missing values
print(df.isnull().sum())

customer_id_legacy     0
transaction_id         0
purchase_amount        0
purchase_date          0
product_category       0
customer_rating       10
dtype: int64


In [3]:
# Fill missing customer_rating values with median imputation:
df['customer_rating'].fillna(df['customer_rating'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_rating'].fillna(df['customer_rating'].median(), inplace=True)


In [4]:
# Check for missing values again
print(df.isnull().sum())

customer_id_legacy    0
transaction_id        0
purchase_amount       0
purchase_date         0
product_category      0
customer_rating       0
dtype: int64


#Step 3: Data Augmentation Strategies

In [5]:
# 1. Add random noise to purchase_amount
noise = np.random.normal(0, 10, size=len(df))
df['purchase_amount'] = df['purchase_amount'] + noise

In [6]:
# 2. Apply log transformation to purchase_amount
df['log_purchase_amount'] = np.log1p(df['purchase_amount'])

In [7]:
# Create a binary target column (e.g., high vs. low purchase amount)
df['target'] = (df['purchase_amount'] > df['purchase_amount'].median()).astype(int)

# Check the distribution of the target variable
print(df['target'].value_counts())

target
1    75
0    75
Name: count, dtype: int64


#Synthetic Data Generation

In [8]:
# 3. Synthetic Data Generation
def generate_synthetic_data(df, num_samples=100):
    synthetic_data = df.sample(n=num_samples, replace=True)
    # Add variations (adjust as needed)
    synthetic_data['purchase_amount'] *= np.random.uniform(0.9, 1.1, size=num_samples)
    synthetic_data['product_category'] = np.random.choice(df['product_category'].unique(), size=num_samples)
    # Ensure 'purchase_date' is datetime before adding timedelta
    synthetic_data['purchase_date'] = pd.to_datetime(synthetic_data['purchase_date'], errors='coerce')
    synthetic_data['purchase_date'] = synthetic_data['purchase_date'] + pd.to_timedelta(np.random.randint(-3, 3, size=num_samples), unit='days')
    synthetic_data['customer_rating'] += np.random.uniform(-0.2, 0.2, size=num_samples)

    # Clip customer_rating to be within the original range
    synthetic_data['customer_rating'] = synthetic_data['customer_rating'].clip(lower=df['customer_rating'].min(), upper=df['customer_rating'].max())
    return synthetic_data

synthetic_data = generate_synthetic_data(df)
df_augmented = pd.concat([df, synthetic_data], ignore_index=True)

#Feature Engineering

In [9]:
# 1. Extract purchase_month
df_augmented['purchase_month'] = pd.to_datetime(df_augmented['purchase_date']).dt.month

# 2. Calculate avg_purchase_amount per customer
avg_purchase = df_augmented.groupby('customer_id_legacy')['purchase_amount'].mean().reset_index()
avg_purchase.columns = ['customer_id_legacy', 'avg_purchase_amount']
df_augmented = pd.merge(df_augmented, avg_purchase, on='customer_id_legacy', how='left')

In [10]:
# 3. Calculate days_since_last_purchase
df_augmented.sort_values(by=['customer_id_legacy', 'purchase_date'], inplace=True)
df_augmented['purchase_date'] = pd.to_datetime(df_augmented['purchase_date'], errors='coerce')
df_augmented.dropna(subset=['purchase_date'], inplace=True)
df_augmented['days_since_last_purchase'] = df_augmented.groupby('customer_id_legacy')['purchase_date'].diff().dt.days
df_augmented['days_since_last_purchase'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_augmented['days_since_last_purchase'].fillna(0, inplace=True)


#Export the Augmented Dataset

In [11]:
# Save the augmented dataset
df_augmented.to_csv('customer_transactions_augmented.csv', index=False)

#Part 2 Merging Datasets with Transitive Properties

In [12]:
import pandas as pd


transactions_df = pd.read_csv('customer_transactions_augmented.csv')
profiles_df = pd.read_csv('customer_social_profiles.csv')
mapping_df = pd.read_csv('id_mapping.csv')

print("Transactions DataFrame:")
print(transactions_df.head())
print("\nProfiles DataFrame:")
print(profiles_df.head())
print("\nMapping DataFrame:")
print(mapping_df.head())

Transactions DataFrame:
   customer_id_legacy  transaction_id  purchase_amount purchase_date  \
0                 100            1147       380.875809    2024-05-23   
1                 100            1147       438.797227    2024-05-26   
2                 100            1113       161.031026    2024-04-22   
3                 100            1147       402.142240    2024-05-26   
4                 101            1021       184.970380    2024-01-19   

  product_category  customer_rating  log_purchase_amount  target  \
0         Clothing         4.428340             5.999289       1   
1            Books         4.427431             5.999289       1   
2         Clothing         4.000000             5.087788       0   
3            Books         4.600000             5.999289       1   
4      Electronics         3.480945             5.230147       0   

   purchase_month  avg_purchase_amount  days_since_last_purchase  
0               5           345.711576                       0.0  


#Understand the ID Mapping


In [13]:
print("\nMapping DataFrame Info:")
print(mapping_df.info())
print("\nUnique Legacy IDs in Mapping:", mapping_df['customer_id_legacy'].nunique())
print("Unique New IDs in Mapping:", mapping_df['customer_id_new'].nunique())


Mapping DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id_legacy  155 non-null    int64 
 1   customer_id_new     155 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.6+ KB
None

Unique Legacy IDs in Mapping: 79
Unique New IDs in Mapping: 73


#Merge transactions with mapping

In [14]:
merged_transactions_mapping = pd.merge(transactions_df, mapping_df, on='customer_id_legacy', how='left')
print("\nMerged Transactions and Mapping DataFrame:")
print(merged_transactions_mapping.head())


Merged Transactions and Mapping DataFrame:
   customer_id_legacy  transaction_id  purchase_amount purchase_date  \
0                 100            1147       380.875809    2024-05-23   
1                 100            1147       438.797227    2024-05-26   
2                 100            1113       161.031026    2024-04-22   
3                 100            1147       402.142240    2024-05-26   
4                 101            1021       184.970380    2024-01-19   

  product_category  customer_rating  log_purchase_amount  target  \
0         Clothing         4.428340             5.999289       1   
1            Books         4.427431             5.999289       1   
2         Clothing         4.000000             5.087788       0   
3            Books         4.600000             5.999289       1   
4      Electronics         3.480945             5.230147       0   

   purchase_month  avg_purchase_amount  days_since_last_purchase  \
0               5           345.711576        

# Merge the result with profiles

In [15]:
final_merged_df = pd.merge(merged_transactions_mapping, profiles_df, on='customer_id_new', how='left')
print("\nFinal Merged DataFrame:")
print(final_merged_df.head())


Final Merged DataFrame:
   customer_id_legacy  transaction_id  purchase_amount purchase_date  \
0                 100            1147       380.875809    2024-05-23   
1                 100            1147       438.797227    2024-05-26   
2                 100            1113       161.031026    2024-04-22   
3                 100            1147       402.142240    2024-05-26   
4                 101            1021       184.970380    2024-01-19   

  product_category  customer_rating  log_purchase_amount  target  \
0         Clothing         4.428340             5.999289       1   
1            Books         4.427431             5.999289       1   
2         Clothing         4.000000             5.087788       0   
3            Books         4.600000             5.999289       1   
4      Electronics         3.480945             5.230147       0   

   purchase_month  avg_purchase_amount  days_since_last_purchase  \
0               5           345.711576                       0.0 

# Handle Conflicts

In [16]:
# This is to check if any other issues caused duplicates.
if 'transaction_id' in final_merged_df.columns:
    final_duplicates = final_merged_df[final_merged_df.duplicated(subset=['transaction_id'], keep=False)]
    if not final_duplicates.empty:
        print("\nPotential duplicates in final_merged_df based on transaction_id (after handling mapping):")
        print(final_duplicates.head())
        print(f"Total potential duplicates: {len(final_duplicates)}")
    else:
        print("\nNo duplicates found in final_merged_df based on transaction_id after handling mapping conflicts.")
else:
    print("\nWarning: 'transaction_id' column not found, cannot check for duplicates based on it.")


Potential duplicates in final_merged_df based on transaction_id (after handling mapping):
   customer_id_legacy  transaction_id  purchase_amount purchase_date  \
0                 100            1147       380.875809    2024-05-23   
1                 100            1147       438.797227    2024-05-26   
3                 100            1147       402.142240    2024-05-26   
4                 101            1021       184.970380    2024-01-19   
5                 101            1021       184.970380    2024-01-19   

  product_category  customer_rating  log_purchase_amount  target  \
0         Clothing         4.428340             5.999289       1   
1            Books         4.427431             5.999289       1   
3            Books         4.600000             5.999289       1   
4      Electronics         3.480945             5.230147       0   
5      Electronics         3.480945             5.230147       0   

   purchase_month  avg_purchase_amount  days_since_last_purchase  \

# Customer Engagement Score

In [17]:
print("\nDataFrame after potential Engagement Score creation:")
print(final_merged_df.head())


DataFrame after potential Engagement Score creation:
   customer_id_legacy  transaction_id  purchase_amount purchase_date  \
0                 100            1147       380.875809    2024-05-23   
1                 100            1147       438.797227    2024-05-26   
2                 100            1113       161.031026    2024-04-22   
3                 100            1147       402.142240    2024-05-26   
4                 101            1021       184.970380    2024-01-19   

  product_category  customer_rating  log_purchase_amount  target  \
0         Clothing         4.428340             5.999289       1   
1            Books         4.427431             5.999289       1   
2         Clothing         4.000000             5.087788       0   
3            Books         4.600000             5.999289       1   
4      Electronics         3.480945             5.230147       0   

   purchase_month  avg_purchase_amount  days_since_last_purchase  \
0               5           345.7115

# Engineer predictive behavioral features

In [18]:
#Moving Averages of Transactions (Adapted)

import pandas as pd

if 'purchase_date' in transactions_df.columns and 'purchase_amount' in transactions_df.columns:
    # Convert purchase_date to datetime
    transactions_df['purchase_date'] = pd.to_datetime(transactions_df['purchase_date'])

    # Sort transactions by customer and date
    transactions_df = transactions_df.sort_values(by=['customer_id_legacy', 'purchase_date'])

    # Set 'purchase_date' as the index for the rolling calculation
    transactions_df = transactions_df.set_index('purchase_date')

    # Define time windows for moving averages (e.g., 7 days, 30 days).
    time_windows = ['7D', '30D']

    for window in time_windows:
        # Calculate rolling mean of purchase amount
        transactions_df[f'purchase_amount_rolling_{window}'] = transactions_df.groupby('customer_id_legacy')['purchase_amount'].rolling(window=window, min_periods=1).mean().reset_index(level=0, drop=True)

    # Reset the index if you need 'purchase_date' as a column later
    transactions_df = transactions_df.reset_index()

    print("\nTransactions DataFrame with Moving Averages:")
    print(transactions_df.head())
else:
    print("Warning: 'purchase_date' or 'purchase_amount' column not found in transactions_df. Skipping Moving Averages.")




Transactions DataFrame with Moving Averages:
  purchase_date  customer_id_legacy  transaction_id  purchase_amount  \
0    2024-04-22                 100            1113       161.031026   
1    2024-05-23                 100            1147       380.875809   
2    2024-05-26                 100            1147       438.797227   
3    2024-05-26                 100            1147       402.142240   
4    2024-01-17                 101            1017       271.942110   

  product_category  customer_rating  log_purchase_amount  target  \
0         Clothing         4.000000             5.087788       0   
1         Clothing         4.428340             5.999289       1   
2            Books         4.427431             5.999289       1   
3            Books         4.600000             5.999289       1   
4            Books         2.100000             5.609260       0   

   purchase_month  avg_purchase_amount  days_since_last_purchase  \
0               4           345.711576      

In [23]:
#Time-based Aggregation of Purchases (Adapted)
if 'purchase_date' in transactions_df.columns and 'purchase_amount' in transactions_df.columns:
    # Convert purchase_date to datetime (if not already done)
    transactions_df['purchase_date'] = pd.to_datetime(transactions_df['purchase_date'])

    # Define a reference date (e.g., the latest purchase date in the dataset)
    reference_date = transactions_df['purchase_date'].max()

    # Calculate time differences
    transactions_df['days_since_purchase'] = (reference_date - transactions_df['purchase_date']).dt.days

    # Aggregate features per customer
    aggregated_purchases = transactions_df.groupby('customer_id_legacy').agg(
        total_purchase_amount=('purchase_amount', 'sum'),
        number_of_transactions=('transaction_id', 'nunique'),
        average_purchase_value=('purchase_amount', 'mean'),
        last_purchase_days=('days_since_purchase', 'min')
    ).reset_index()

    print("\nAggregated Purchase Features per Customer:")
    print(aggregated_purchases.head())

    # Merge aggregated features into final_merged_df
    final_merged_df = pd.merge(final_merged_df, aggregated_purchases, on='customer_id_legacy', how='left')
else:
    print("Warning: 'purchase_date' or 'purchase_amount' column not found in transactions_df. Skipping Time-based Aggregation.")


Aggregated Purchase Features per Customer:
   customer_id_legacy  total_purchase_amount  number_of_transactions  \
0                 100            1382.846303                       2   
1                 101            1530.644541                       4   
2                 102            1189.774113                       3   
3                 103            1237.305302                       3   
4                 104             711.435280                       2   

   average_purchase_value  last_purchase_days  
0              345.711576                   3  
1              218.663506                  77  
2              198.295686                   4  
3              309.326325                  74  
4              355.717640                   2  


In [24]:
#TF-IDF on Customer Reviews or Social Media Comments

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
if 'social_media_text' in profiles_df.columns:
    text_column = 'social_media_text'
    text_data = profiles_df[['customer_id_new', text_column]].dropna()

    if not text_data.empty:
        # Initialize TF-IDF Vectorizer
        tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

        # Fit and transform the text data
        tfidf_matrix = tfidf_vectorizer.fit_transform(text_data[text_column])

        # Get feature names (words)
        feature_names = tfidf_vectorizer.get_feature_names_out()

        # Create a DataFrame of TF-IDF features
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{word}' for word in feature_names])

        # Concatenate with customer IDs (customer_id_new in this case)
        tfidf_df = pd.concat([text_data['customer_id_new'].reset_index(drop=True), tfidf_df], axis=1)

        # Aggregate TF-IDF features by customer (e.g., taking the mean)
        tfidf_aggregated = tfidf_df.groupby('customer_id_new').mean().reset_index()

        print("\nAggregated TF-IDF Features from Social Media Text:")
        print(tfidf_aggregated.head())

        # Merge TF-IDF features into final_merged_df using customer_id_new
        final_merged_df = pd.merge(final_merged_df, tfidf_aggregated, on='customer_id_new', how='left')
    else:
        print(f"No non-missing values found in the '{text_column}' column of profiles_df.")

else:
    print("Warning: 'social_media_text' column not found in profiles_df. Skipping TF-IDF.")



#Export the Final Preprocessed Data

In [25]:
group_number = "7"
output_filename = f'final_customer_data_{7}.csv'
final_merged_df.to_csv(output_filename, index=False)
print(f"\nFinal preprocessed data saved to: {output_filename}")


Final preprocessed data saved to: final_customer_data_7.csv
