In [10]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler

# Preprocessing cell: load the raw churn CSV, engineer two features, standardize
# the numeric columns, and write the result as Parquet for the Feast feature repo.

# 0. Locations
# The project root defaults to the original machine-specific path but can be
# overridden with the CHURN_PROJECT_ROOT environment variable, so the notebook
# runs on other machines without editing code.
PROJECT_ROOT = os.environ.get('CHURN_PROJECT_ROOT', '/Users/khoale/Desktop/Customer-Churn-Project')
RAW_CSV_PATH = os.path.join(PROJECT_ROOT, 'data', 'telecom_churn.csv')
FEAST_DATA_DIR = os.path.join(PROJECT_ROOT, 'churn_feature_repo', 'feature_repo', 'data')
PROCESSED_PARQUET_PATH = os.path.join(FEAST_DATA_DIR, 'processed_churn_data.parquet')

# 1. Load the data (expected to already carry customer_id and event_timestamp,
#    added upstream by add_data.py)
df = pd.read_csv(RAW_CSV_PATH)
rows_loaded = len(df)

# 1b. Convert timestamp to timezone-aware UTC datetimes. errors='coerce' turns
#     unparseable values into NaT; those rows are then dropped.
df['event_timestamp'] = pd.to_datetime(df['event_timestamp'], utc=True, errors='coerce')
# Explicit copy so the column assignments below write to an independent frame
# rather than to a possible view of the pre-dropna data.
df = df.dropna(subset=['event_timestamp']).copy()

# Surface data loss instead of discarding rows silently.
rows_dropped = rows_loaded - len(df)
if rows_dropped:
    print(f"Warning: dropped {rows_dropped} of {rows_loaded} rows with a missing or unparseable event_timestamp")
if df.empty:
    raise ValueError(f"No rows with a valid event_timestamp in {RAW_CSV_PATH}; nothing to process")

# 2. Feature Engineering
# These must run BEFORE scaling: the comparisons below test the raw values
# (== 1, == 0), which no longer hold once the columns are standardized.
# Average minutes per daytime call; a zero call count is replaced by 1 to avoid
# division by zero.
df['MinsPerDayCall'] = df['DayMins'] / df['DayCalls'].replace(0, 1)
# 1 when the customer has a data plan but zero recorded data usage, else 0.
df['UnderusingData'] = ((df['DataPlan'] == 1) & (df['DataUsage'] == 0)).astype(int)

# 3. Scaling
# Every column except the label, the entity id and the timestamp is standardized.
# This includes the engineered features and the 0/1 flag columns.
# NOTE(review): the scaler is fit on all rows and is not persisted in this cell;
# confirm that training/serving code applies the same transform and that fitting
# before any train/test split is intended.
cols_to_scale = df.drop(columns=['Churn', 'customer_id', 'event_timestamp']).columns
scaler = StandardScaler()
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# 4. Save to Parquet for Feast
# Written to the data directory inside the feature repo; the pandas index is
# not stored.
os.makedirs(FEAST_DATA_DIR, exist_ok=True)
df.to_parquet(PROCESSED_PARQUET_PATH, index=False)

print("Processing complete! Saved whole dataset to churn_feature_repo/feature_repo/data/processed_churn_data.parquet")

Processing complete! Saved whole dataset to churn_feature_repo/feature_repo/data/processed_churn_data.parquet


In [11]:
import pandas as pd

# Sanity-check cell: reload the processed Parquet file and print its schema and
# a few rows to confirm what was written.
import os  # imported here so this cell also runs on its own

# Resolve the file from the project root instead of a hard-coded absolute path.
# The default is the original location; set CHURN_PROJECT_ROOT to override it.
PROJECT_ROOT = os.environ.get('CHURN_PROJECT_ROOT', '/Users/khoale/Desktop/Customer-Churn-Project')
PROCESSED_PARQUET_PATH = os.path.join(
    PROJECT_ROOT, "churn_feature_repo", "feature_repo", "data", "processed_churn_data.parquet"
)

df = pd.read_parquet(PROCESSED_PARQUET_PATH)

# Quick check of types
print(df.dtypes)

# Check first few rows
print(df.head())

# Look specifically at CustServCalls alongside the entity id
print(df[['customer_id', 'CustServCalls']].head(10))


Churn                            int64
AccountWeeks                   float64
ContractRenewal                float64
DataPlan                       float64
DataUsage                      float64
CustServCalls                  float64
DayMins                        float64
DayCalls                       float64
MonthlyCharge                  float64
OverageFee                     float64
RoamMins                       float64
event_timestamp    datetime64[ns, UTC]
customer_id                      int64
MinsPerDayCall                 float64
UnderusingData                 float64
dtype: object
   Churn  AccountWeeks  ContractRenewal  DataPlan  DataUsage  CustServCalls  \
0      0      0.697467         0.328457  1.588215   1.430645      -0.428907   
1      0      0.177026         0.328457  1.588215   2.201015      -0.428907   
2      0      0.920514         0.328457 -0.629638  -0.649356      -1.191346   
3      0     -0.392981        -3.044541 -0.629638  -0.649356       0.333531   
4     