In [1]:
import pandas as pd
import numpy as np
import os

In [10]:
# Raw CSV input for this notebook (absolute path on the author's machine).
data_path = r"C:/Users/hetpa/OneDrive/Desktop/AIML/AI Agent Sales/Data/raw/Year 2009-2010.csv"

# Decode as ISO-8859-1 instead of pandas' default UTF-8.
df = pd.read_csv(filepath_or_buffer=data_path, encoding='ISO-8859-1')

record_count = df.shape[0]
print(f"Loaded dataset with {record_count} records.")
df.head(5)


Loaded dataset with 525461 records.


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/1/2009 7:45,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,12/1/2009 7:45,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,12/1/2009 7:45,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,12/1/2009 7:45,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,12/1/2009 7:45,1.25,13085.0,United Kingdom


In [11]:
# Keep only rows that have every required field and strictly positive
# quantity and price; everything else is treated as an invalid entry.
required_columns = ['Customer ID', 'InvoiceDate', 'Quantity', 'Price']
has_required = df[required_columns].notna().all(axis=1)
is_positive = df['Quantity'].gt(0) & df['Price'].gt(0)

# Renumber the surviving rows from 0 so the index has no gaps.
df = df.loc[has_required & is_positive].reset_index(drop=True)
print(df.shape)
df.head()

(407664, 8)


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/1/2009 7:45,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,12/1/2009 7:45,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,12/1/2009 7:45,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,12/1/2009 7:45,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,12/1/2009 7:45,1.25,13085.0,United Kingdom


In [12]:
# --- Feature Mapping ---
# CRM-style fields derived directly from the invoice columns.
df['Deal Amount'] = df['Quantity'] * df['Price']
# No explicit format is passed, so pandas infers it from the strings.
df['Last Contact Date'] = pd.to_datetime(df['InvoiceDate'])
# The customer's country is used as a stand-in for an industry field.
df['Industry'] = df['Country']

# --- Simulate fields not present ---
# Draw every synthetic column from one seeded generator so that
# Restart Kernel -> Run All reproduces exactly the same dataset. The unseeded
# global np.random state produced different columns (and labels) on every run.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
n_rows = len(df)

df['Sales Stage'] = rng.choice(['Contacted', 'Negotiation', 'Proposal Sent'], size=n_rows)
df['Lead Source'] = rng.choice(['Website', 'Referral', 'Social Media', 'Event'], size=n_rows)
df['Emails'] = rng.integers(1, 10, size=n_rows)   # 1..9 (upper bound exclusive)
df['Meetings'] = rng.integers(0, 5, size=n_rows)  # 0..4 (upper bound exclusive)
# NOTE: the label is sampled independently of every other column, so it
# carries no real signal; it only fixes the class balance.
df['Deal Status'] = rng.choice([0, 1], size=n_rows, p=[0.7, 0.3])  # 70% negative class

df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Deal Amount,Last Contact Date,Industry,Sales Stage,Lead Source,Emails,Meetings,Deal Status
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/1/2009 7:45,6.95,13085.0,United Kingdom,83.4,2009-12-01 07:45:00,United Kingdom,Contacted,Event,7,4,0
1,489434,79323P,PINK CHERRY LIGHTS,12,12/1/2009 7:45,6.75,13085.0,United Kingdom,81.0,2009-12-01 07:45:00,United Kingdom,Contacted,Social Media,8,4,0
2,489434,79323W,WHITE CHERRY LIGHTS,12,12/1/2009 7:45,6.75,13085.0,United Kingdom,81.0,2009-12-01 07:45:00,United Kingdom,Contacted,Event,9,2,0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,12/1/2009 7:45,2.1,13085.0,United Kingdom,100.8,2009-12-01 07:45:00,United Kingdom,Negotiation,Social Media,9,2,0
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,12/1/2009 7:45,1.25,13085.0,United Kingdom,30.0,2009-12-01 07:45:00,United Kingdom,Negotiation,Referral,9,3,1


In [13]:
# Columns that make up the CRM-style dataset, in output order.
crm_columns = [
    'Industry', 'Sales Stage', 'Lead Source', 'Deal Amount',
    'Last Contact Date', 'Emails', 'Meetings', 'Deal Status',
]
crm_df = df.loc[:, crm_columns]

print("Final shape:", crm_df.shape)
crm_df.head()

Final shape: (407664, 8)


Unnamed: 0,Industry,Sales Stage,Lead Source,Deal Amount,Last Contact Date,Emails,Meetings,Deal Status
0,United Kingdom,Contacted,Event,83.4,2009-12-01 07:45:00,7,4,0
1,United Kingdom,Contacted,Social Media,81.0,2009-12-01 07:45:00,8,4,0
2,United Kingdom,Contacted,Event,81.0,2009-12-01 07:45:00,9,2,0
3,United Kingdom,Negotiation,Social Media,100.8,2009-12-01 07:45:00,9,2,0
4,United Kingdom,Negotiation,Referral,30.0,2009-12-01 07:45:00,9,3,1


In [14]:
# Create the output folder if it is missing (no error when it already exists).
output_dir = r"C:/Users/hetpa/OneDrive/Desktop/AIML/AI Agent Sales/Data/processes data"
os.makedirs(output_dir, exist_ok=True)

# Write the CRM frame as CSV, leaving out the row index.
output_file = f"{output_dir}/converted_sales_data.csv"
crm_df.to_csv(output_file, index=False)

print("✅ Converted dataset saved successfully")


✅ Converted dataset saved successfully
