<a href="https://colab.research.google.com/github/John-Akech/Formative-2---Data-Preprocessing/blob/master/Part_1_Data_Augmentation_on_CSV_Files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning & Handling Missing Values

In [443]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from imblearn.over_sampling import SMOTE
import uuid
import datetime

# Step 1: Load the Dataset
# The CSV is read from an absolute path under /content.
try:
    # Load the dataset
    df = pd.read_csv('/content/customer_transactions.csv')
except FileNotFoundError:
    # Report the problem, then re-raise so that this cell (and a "Run All")
    # stops with a normal traceback. Calling exit() inside a notebook shuts
    # the kernel down rather than just halting the cell, which throws away
    # all state and hides the real error.
    print("Error: The file 'customer_transactions.csv' was not found.")
    raise

In [444]:
# Display basic information
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() appends a stray "None"
# line after the report.
print("\nDataset Overview:")
df.info()


Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id_legacy  150 non-null    int64  
 1   transaction_id      150 non-null    int64  
 2   purchase_amount     150 non-null    int64  
 3   purchase_date       150 non-null    object 
 4   product_category    150 non-null    object 
 5   customer_rating     140 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 7.2+ KB
None


In [445]:
# Preview the first five records of the loaded data
print("\nFirst 5 Rows:", df.head(), sep="\n")


First 5 Rows:
   customer_id_legacy  transaction_id  purchase_amount purchase_date  \
0                 151            1001              408    2024-01-01   
1                 192            1002              332    2024-01-02   
2                 114            1003              442    2024-01-03   
3                 171            1004              256    2024-01-04   
4                 160            1005               64    2024-01-05   

  product_category  customer_rating  
0           Sports              2.3  
1      Electronics              4.2  
2      Electronics              2.1  
3         Clothing              2.8  
4         Clothing              1.3  


In [446]:
# Step 2: Ensure Correct Data Types
# Column groups reused by the type-conversion, imputation and encoding cells below.
numerical_cols = [
    'purchase_amount',
    'customer_rating',
]
categorical_cols = [
    'product_category',
]

In [447]:
# Parse purchase_date into datetime values; errors='coerce' turns
# unparseable entries into NaT instead of raising.
df['purchase_date'] = df['purchase_date'].pipe(pd.to_datetime, errors='coerce')

In [448]:
# Split purchase_date into separate year / month / day feature columns
for part in ('year', 'month', 'day'):
    df[part] = getattr(df['purchase_date'].dt, part)

In [449]:
# Remove the original purchase_date column now that its parts are stored
# in year / month / day (the deletion happens in place on df).
del df['purchase_date']

In [450]:
# Force the numerical columns to numeric dtypes; values that cannot be
# parsed become NaN (errors='coerce') and are handled by the imputation step.
df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

In [451]:
# Store the categorical columns with pandas' 'category' dtype
df[categorical_cols] = df[categorical_cols].astype('category')

In [452]:
# Confirm the dtypes after the conversions above
print("\nUpdated Data Types:", df.dtypes, sep="\n")


Updated Data Types:
customer_id_legacy       int64
transaction_id           int64
purchase_amount          int64
product_category      category
customer_rating        float64
year                     int32
month                    int32
day                      int32
dtype: object


In [453]:
# Step 3: Handle Missing Values
# Count the missing entries in every column before imputing
print("\nMissing Values Summary:", df.isna().sum(), sep="\n")


Missing Values Summary:
customer_id_legacy     0
transaction_id         0
purchase_amount        0
product_category       0
customer_rating       10
year                   0
month                  0
day                    0
dtype: int64


In [454]:
# Fill gaps in the numeric features with each column's median.
# numerical_cols contains customer_rating, so its missing values are filled here.
# NOTE(review): the date parts appear as floats (e.g. 2024.0) in later output,
# presumably because the imputer's result replaces the integer columns - confirm
# whether they should be cast back to integers.
imputer = SimpleImputer(strategy='median')
columns_to_impute = numerical_cols + ['year', 'month', 'day']
df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])

In [455]:
# Impute categorical columns using mode
for col in categorical_cols:
    # mode() returns the most frequent value(s); take the first one
    mode_value = df[col].mode()[0]
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on df[col] operates on an intermediate object
    # (chained assignment): recent pandas versions warn about it, and with
    # Copy-on-Write enabled the original frame is left unchanged.
    df[col] = df[col].fillna(mode_value)
    print(f"\nFilled missing values in {col} with mode: {mode_value}")


Filled missing values in product_category with mode: Sports


In [456]:
# Regression fallback: predict any customer_rating values that are still
# missing from purchase_amount using a linear model fitted on the known rows.
# NOTE(review): customer_rating is part of numerical_cols and was already
# median-imputed above, so this branch appears not to run - confirm which of
# the two strategies is intended.
rating_is_missing = df['customer_rating'].isnull()
if rating_is_missing.any():
    # Rows with a known rating form the training set
    X_train = df.loc[~rating_is_missing, ['purchase_amount']]
    y_train = df.loc[~rating_is_missing, 'customer_rating']
    # Rows without a rating are the ones to predict
    X_missing = df.loc[rating_is_missing, ['purchase_amount']]

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    df.loc[rating_is_missing, 'customer_rating'] = regressor.predict(X_missing)

In [457]:
# Re-count missing entries to confirm that imputation left none behind
print("\nMissing Values After Imputation:", df.isna().sum(), sep="\n")


Missing Values After Imputation:
customer_id_legacy    0
transaction_id        0
purchase_amount       0
product_category      0
customer_rating       0
year                  0
month                 0
day                   0
dtype: int64


# Data Augmentation Strategies

In [458]:
# Step 4: Apply Random Noise to Numerical Columns
# Jitter purchase_amount with zero-mean Gaussian noise whose standard
# deviation is a fraction (noise_factor) of the column's own standard deviation.
noise_factor = 0.05  # Adjust noise factor for better augmentation
original_std = df['purchase_amount'].std()

# A dedicated, seeded generator makes the noise reproducible across
# "Restart & Run All"; the seed matches the random_state used for SMOTE below.
noise_rng = np.random.default_rng(42)
# Note: the update is additive, so re-running this cell without reloading df
# stacks another layer of noise on top of the previous one.
df['purchase_amount'] += noise_rng.normal(0, noise_factor * original_std, df.shape[0])

print("\nRandom Noise Applied to purchase_amount:")
print(df[['purchase_amount']].head())


Random Noise Applied to purchase_amount:
   purchase_amount
0       410.440595
1       323.515841
2       444.550128
3       261.623278
4        60.853957


In [459]:
# Step 5: Transform Skewed Features
# Measure how asymmetric the purchase_amount distribution is
skewness = df['purchase_amount'].skew()
print(f"\nSkewness of purchase_amount: {skewness}")

# Only a skewness above 1 triggers the log1p transform;
# otherwise the column is left untouched.
needs_log_transform = skewness > 1
if needs_log_transform:
    df['purchase_amount'] = df['purchase_amount'].pipe(np.log1p)
    print("\nLog Transformation Applied to purchase_amount.")

# Show the column as it stands after this step (transformed or not)
print("\nTransformed purchase_amount:", df[['purchase_amount']].head(), sep="\n")


Skewness of purchase_amount: 0.040473043333057894

Transformed purchase_amount:
   purchase_amount
0       410.440595
1       323.515841
2       444.550128
3       261.623278
4        60.853957


In [460]:
# Step 6: Generate Synthetic Data (Choose One Approach)

# Approach 1: Discretize Target Variable and Use SMOTE
def augment_with_smote(df):
    """Resample the dataset with SMOTE on a binned version of customer_rating.

    Steps:
      1. Drop the ID columns and the target to obtain the feature frame.
      2. One-hot encode the columns listed in the module-level
         ``categorical_cols`` (the first category of each is dropped).
      3. Bin customer_rating into 3 equal-width ordinal classes.
      4. Run SMOTE (random_state=42) on the encoded features and binned target.
      5. Map the class labels back through the discretizer's
         ``inverse_transform`` and decode the one-hot columns back into a
         single categorical column per feature.
      6. Attach fresh customer_id_legacy / transaction_id values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain customer_id_legacy, transaction_id, customer_rating and
        every column in ``categorical_cols``; the remaining columns are used
        as features for SMOTE.

    Returns
    -------
    pandas.DataFrame
        The resampled features, customer_rating (bin-level values, not the
        original ratings), the decoded categorical columns and new IDs.

    Notes
    -----
    NOTE(review): in the notebook output the first rows of the result carry
    the same purchase_amount values as the original records, so the result
    appears to contain the original rows (with binned ratings) followed by
    the generated ones. Concatenating it with ``df`` would then duplicate the
    originals - confirm whether only the generated rows should be kept.
    """
    # Separate features from identifiers and the target
    X = df.drop(columns=['customer_id_legacy', 'transaction_id', 'customer_rating'])
    y = df['customer_rating']

    # One-hot encode categorical features
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded_values = encoder.fit_transform(X[categorical_cols])
    encoded_names = list(encoder.get_feature_names_out(categorical_cols))
    # index=X.index keeps the encoded rows aligned with X in the concat below
    # even when df does not have a default 0..n-1 index.
    X_encoded = pd.DataFrame(encoded_values, columns=encoded_names, index=X.index)
    X_final = pd.concat([X.drop(columns=categorical_cols), X_encoded], axis=1)

    # Discretize the continuous target into bins
    discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    y_discrete = discretizer.fit_transform(y.values.reshape(-1, 1)).ravel()

    # Apply SMOTE to the discretized target
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_final, y_discrete)

    # Decode the discretized target back to the original scale
    y_resampled_continuous = discretizer.inverse_transform(y_resampled.reshape(-1, 1)).ravel()

    # Combine resampled data into a DataFrame (positional index, so the
    # target array below lines up row by row)
    synthetic_data = pd.DataFrame(X_resampled, columns=X_final.columns).reset_index(drop=True)
    synthetic_data['customer_rating'] = y_resampled_continuous

    def decode_one_hot(encoded_df):
        """Turn the one-hot columns back into one label column per feature.

        Because the encoder drops the first category, that category is
        represented by all-zero indicator columns. Its weight is recovered as
        ``1 - sum(kept indicators)`` and takes part in the arg-max, so rows of
        the dropped category are no longer mislabelled as the first kept
        category, and interpolated (fractional) rows resolve to the category
        with the largest weight.
        """
        decoded_df = pd.DataFrame(index=encoded_df.index)
        offset = 0
        for col, categories in zip(categorical_cols, encoder.categories_):
            # drop='first' removes categories[0]; the remaining ones appear in
            # encoded_names in the same order, feature after feature.
            kept_count = len(categories) - 1
            kept_cols = encoded_names[offset:offset + kept_count]
            offset += kept_count

            kept_weights = encoded_df[kept_cols].to_numpy(dtype=float)
            dropped_weight = 1.0 - kept_weights.sum(axis=1, keepdims=True)
            all_weights = np.hstack([dropped_weight, kept_weights])
            decoded_df[col] = np.asarray(categories)[all_weights.argmax(axis=1)]
        return decoded_df

    # Replace the indicator columns with the decoded categorical columns
    decoded_categoricals = decode_one_hot(synthetic_data)
    synthetic_data = pd.concat(
        [synthetic_data.drop(columns=encoded_names), decoded_categoricals],
        axis=1,
    )

    # Generate synthetic IDs: random 9-digit-bounded customer IDs and
    # transaction IDs continuing after the largest existing one
    n_rows = synthetic_data.shape[0]
    synthetic_data['customer_id_legacy'] = [uuid.uuid4().int % 10**9 for _ in range(n_rows)]
    first_new_id = df['transaction_id'].max() + 1
    synthetic_data['transaction_id'] = range(first_new_id, first_new_id + n_rows)

    return synthetic_data

In [461]:
# Approach 2: Interpolation for Continuous Target
def augment_with_interpolation(df, random_state=None):
    """Create synthetic rows by interpolating between random pairs of rows.

    For every synthetic row two distinct source rows and a mixing factor
    ``alpha`` in [0, 1) are drawn. Numeric features and customer_rating are
    blended as ``alpha * row1 + (1 - alpha) * row2``. Non-numeric features
    (e.g. the categorical product_category) cannot be multiplied by a float,
    so they are copied from whichever source row has the larger weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain customer_id_legacy, transaction_id and customer_rating;
        every other column is treated as a feature.
    random_state : int, optional
        Seed for the pair / alpha draws. When omitted, NumPy's global random
        state is used. The customer_id_legacy values are uuid-based and are
        not controlled by this seed.

    Returns
    -------
    pandas.DataFrame
        ``len(df)`` synthetic rows with the feature columns (original order),
        customer_rating, customer_id_legacy and transaction_id. Transaction
        IDs continue after the largest existing transaction_id.

    Raises
    ------
    ValueError
        If ``df`` has fewer than two rows (no pair to interpolate between).
    """
    if len(df) < 2:
        raise ValueError(
            "augment_with_interpolation needs at least two rows to interpolate between."
        )

    # Use the global NumPy state unless an explicit seed is requested
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Prepare data for augmentation (positional index so row numbers can be
    # used directly, whatever index df carries)
    X = df.drop(columns=['customer_id_legacy', 'transaction_id', 'customer_rating']).reset_index(drop=True)
    y = df['customer_rating'].to_numpy(dtype=float)
    numeric_cols = set(X.select_dtypes(include='number').columns)

    # Generate as many synthetic samples as the original dataset
    n_synthetic_samples = len(df)
    first = np.empty(n_synthetic_samples, dtype=int)
    second = np.empty(n_synthetic_samples, dtype=int)
    alpha = np.empty(n_synthetic_samples, dtype=float)
    for i in range(n_synthetic_samples):
        # Randomly select two distinct samples and an interpolation factor
        first[i], second[i] = rng.choice(len(X), size=2, replace=False)
        alpha[i] = rng.uniform(0, 1)

    # The source row that contributes the larger share of each synthetic row
    dominant_parent = np.where(alpha >= 0.5, first, second)

    # Build the synthetic features column by column, keeping X's column order
    synthetic_columns = {}
    for col in X.columns:
        if col in numeric_cols:
            values = X[col].to_numpy(dtype=float)
            synthetic_columns[col] = alpha * values[first] + (1 - alpha) * values[second]
        else:
            synthetic_columns[col] = X[col].iloc[dominant_parent].reset_index(drop=True)
    synthetic_data = pd.DataFrame(synthetic_columns)

    # Interpolate the target with the same weights as the features
    synthetic_data['customer_rating'] = alpha * y[first] + (1 - alpha) * y[second]

    # Generate synthetic IDs
    synthetic_data['customer_id_legacy'] = [uuid.uuid4().int % 10**9 for _ in range(n_synthetic_samples)]
    first_new_id = df['transaction_id'].max() + 1
    synthetic_data['transaction_id'] = range(first_new_id, first_new_id + n_synthetic_samples)

    return synthetic_data

In [462]:
# Choose an augmentation approach
augmentation_method = "smote"  # Change to "interpolation" if needed

if augmentation_method == "smote":
    synthetic_data = augment_with_smote(df)
elif augmentation_method == "interpolation":
    synthetic_data = augment_with_interpolation(df)
else:
    # Fail loudly on a typo: without this branch synthetic_data would be
    # undefined here (or silently reused from an earlier run of this cell).
    raise ValueError(
        f"Unknown augmentation_method {augmentation_method!r}; "
        "expected 'smote' or 'interpolation'."
    )

# Concatenate synthetic data with the original dataset
# NOTE(review): with the "smote" method the first synthetic rows printed below
# carry the same purchase_amount values as the original rows, so the result
# appears to already include the originals - confirm that this concat does not
# duplicate them.
df_augmented = pd.concat([df, synthetic_data], axis=0).reset_index(drop=True)

print("\nSynthetic Data Generated:")
print(synthetic_data.head())


Synthetic Data Generated:
   purchase_amount    year  month  day  customer_rating product_category  \
0       410.440595  2024.0    1.0  1.0         1.666667           Sports   
1       323.515841  2024.0    1.0  2.0         4.333333      Electronics   
2       444.550128  2024.0    1.0  3.0         1.666667      Electronics   
3       261.623278  2024.0    1.0  4.0         3.000000         Clothing   
4        60.853957  2024.0    1.0  5.0         1.666667         Clothing   

   customer_id_legacy  transaction_id  
0            89990907            1151  
1           874511730            1152  
2           594087804            1153  
3           210559944            1154  
4           366967634            1155  


# Export the Augmented Data

In [463]:
# Export the Augmented Dataset
# The file name carries a second-resolution timestamp of the export time.
export_time = datetime.datetime.now()
timestamp = export_time.strftime('%Y%m%d_%H%M%S')
output_file = f'customer_transactions_augmented_{timestamp}.csv'

# Write the combined frame without the positional index column
df_augmented.to_csv(path_or_buf=output_file, index=False)

print(f"\nAugmented Dataset Saved Successfully as '{output_file}'!")


Augmented Dataset Saved Successfully as 'customer_transactions_augmented_20250316_174840.csv'!
