In [97]:
%pip install pandas scikit-learn matplotlib seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


1. load data

In [72]:
import pandas as pd
df = pd.read_csv('sales_regression.csv')
df.sales_amount = df.sales_amount.astype(int)
df.head()

Unnamed: 0,customer_id,age,gender,city,total_orders,avg_order_value,loyalty_points,days_since_last_order,is_subscribed,device_type,sales_amount
0,1,56,M,Tashkent,20,114110.0,194.0,89.0,1,Mobile,3259707
1,2,46,M,Tashkent,26,234626.0,329.0,66.0,0,Mobile,6760864
2,3,32,F,Andijan,4,202008.0,423.0,30.0,0,Desktop,1740624
3,4,25,F,Tashkent,46,166895.0,174.0,3.0,0,Desktop,7436054
4,5,38,M,Andijan,22,54426.0,489.0,21.0,0,Desktop,1104221


In [73]:
df.describe()

Unnamed: 0,customer_id,age,total_orders,avg_order_value,loyalty_points,days_since_last_order,is_subscribed,sales_amount
count,1030.0,1030.0,1030.0,979.0,979.0,979.0,1030.0,1030.0
mean,502.901942,38.707767,25.56699,175327.79571,257.427988,49.738509,0.478641,5063351.0
std,289.078041,12.172434,14.121398,72716.25927,139.46253,28.315327,0.499786,5971900.0
min,1.0,18.0,1.0,50869.0,0.0,1.0,0.0,-2307649.0
25%,252.25,28.0,13.0,113786.0,141.0,25.0,0.0,1998847.0
50%,505.5,40.0,26.0,176155.0,257.0,50.0,0.0,4030853.0
75%,752.75,49.75,38.0,237516.0,382.0,74.0,1.0,6993354.0
max,1000.0,59.0,49.0,299880.0,499.0,99.0,1.0,99429910.0


In [74]:
df.device_type.value_counts()

device_type
Desktop    538
Mobile     492
Name: count, dtype: int64

In [75]:
# outliers, errors
df = df[df.sales_amount > 0]

2. data cleaning

In [76]:
df.drop_duplicates(inplace=True)

In [77]:
df.isna().sum()

customer_id               0
age                       0
gender                    0
city                     45
total_orders              0
avg_order_value          47
loyalty_points           48
days_since_last_order    46
is_subscribed             0
device_type               0
sales_amount              0
dtype: int64

In [78]:
45 / 9.58


4.697286012526096

In [79]:
df.city.value_counts()

city
Andijan      196
Samarkand    195
Namangan     191
Tashkent     183
Bukhara      148
Name: count, dtype: int64

In [80]:
# city: dropna
df = df.dropna(subset=['city'])

In [81]:
df.avg_order_value.isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
997     False
998     False
999     False
1002    False
1025    False
Name: avg_order_value, Length: 913, dtype: bool

In [82]:
# Fill missing values with mean
df = df.assign(
    avg_order_value=df['avg_order_value'].fillna(df['avg_order_value'].mean()),
    loyalty_points=df['loyalty_points'].fillna(df['loyalty_points'].mean()),
    days_since_last_order=df['days_since_last_order'].fillna(df['days_since_last_order'].mean())
)

# Verify no more missing values
print("\nMissing values after filling with mean:")
print(df[['avg_order_value', 'loyalty_points', 'days_since_last_order']].isnull().sum())


Missing values after filling with mean:
avg_order_value          0
loyalty_points           0
days_since_last_order    0
dtype: int64


3. labelling

In [83]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Initialize encoders
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid dummy variable trap

# Encode gender (binary)
df['gender'] = label_encoder.fit_transform(df['gender'])  # M->0, F->1

# One-hot encode city
city_encoded = onehot_encoder.fit_transform(df[['city']])
city_columns = [f"city_{cat}" for cat in onehot_encoder.categories_[0][1:]]  # Get column names, drop first
df[city_columns] = city_encoded

# Encode device_type (binary)
df['device_type'] = label_encoder.fit_transform(df['device_type'])  # Mobile->0, Desktop->1

# Drop the original city column
df = df.drop('city', axis=1)

# Display the first few rows to verify the changes
print("\nFirst 5 rows after encoding:")
display(df.head())


First 5 rows after encoding:


Unnamed: 0,customer_id,age,gender,total_orders,avg_order_value,loyalty_points,days_since_last_order,is_subscribed,device_type,sales_amount,city_Bukhara,city_Namangan,city_Samarkand,city_Tashkent
0,1,56,1,20,114110.0,194.0,89.0,1,1,3259707,0.0,0.0,0.0,1.0
1,2,46,1,26,234626.0,329.0,66.0,0,1,6760864,0.0,0.0,0.0,1.0
2,3,32,0,4,202008.0,423.0,30.0,0,0,1740624,0.0,0.0,0.0,0.0
3,4,25,0,46,166895.0,174.0,3.0,0,0,7436054,0.0,0.0,0.0,1.0
4,5,38,1,22,54426.0,489.0,21.0,0,0,1104221,0.0,0.0,0.0,0.0


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 913 entries, 0 to 1025
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   customer_id            913 non-null    int64  
 1   age                    913 non-null    int64  
 2   gender                 913 non-null    int64  
 3   total_orders           913 non-null    int64  
 4   avg_order_value        913 non-null    float64
 5   loyalty_points         913 non-null    float64
 6   days_since_last_order  913 non-null    float64
 7   is_subscribed          913 non-null    int64  
 8   device_type            913 non-null    int64  
 9   sales_amount           913 non-null    int64  
 10  city_Bukhara           913 non-null    float64
 11  city_Namangan          913 non-null    float64
 12  city_Samarkand         913 non-null    float64
 13  city_Tashkent          913 non-null    float64
dtypes: float64(7), int64(7)
memory usage: 107.0 KB


4. split dataset

In [85]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop('sales_amount', axis=1)  # All columns except the target
y = df['sales_amount']  # Target variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=42  # For reproducibility
)

# Display the shapes of the resulting datasets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
print("\nTraining set target distribution:")
print(y_train.describe())
print("\nTesting set target distribution:")
print(y_test.describe())


Training set shape: (730, 13)
Testing set shape: (183, 13)

Training set target distribution:
count    7.300000e+02
mean     5.251677e+06
std      5.629806e+06
min      9.570000e+02
25%      2.323610e+06
50%      4.191384e+06
75%      7.069932e+06
max      9.942991e+07
Name: sales_amount, dtype: float64

Testing set target distribution:
count    1.830000e+02
mean     5.821582e+06
std      7.806790e+06
min      4.640200e+04
25%      2.423372e+06
50%      4.167418e+06
75%      7.345221e+06
max      8.172326e+07
Name: sales_amount, dtype: float64


5. fit train

In [89]:
from sklearn.linear_model import LinearRegression

# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)




0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [90]:
# Make predictions
y_test_pred = model.predict(X_test)

y_test_pred

array([ 5.39507749e+06,  3.64884036e+06,  7.99511583e+06,  9.90075690e+06,
        3.43792099e+06,  7.01510678e+06,  6.42056641e+06,  4.67853180e+06,
        5.89342233e+06,  2.45368464e+06,  3.71420591e+06,  8.91846264e+06,
        4.81580186e+06,  6.78814436e+06,  1.07870454e+07,  2.94578571e+06,
        6.88013331e+06,  1.27710733e+06,  4.19192361e+06,  2.73308196e+06,
        4.83667286e+06,  3.04297033e+06,  2.88836563e+06,  6.50492626e+06,
        6.37283685e+06,  5.08708245e+06,  6.08463893e+06,  1.72022534e+06,
        6.96216437e+06,  7.72484799e+06,  3.90053407e+06,  5.51909771e+06,
        6.31188911e+05, -5.15753372e+05, -7.43011767e+05,  9.84689705e+06,
        5.64251584e+06,  4.18981299e+06,  8.01755716e+06, -1.36150350e+06,
        3.25862012e+06,  2.76427035e+06,  4.05251271e+06,  1.17731845e+07,
        9.22904610e+06,  3.39843911e+06, -3.87593072e+05,  1.17430127e+07,
        1.02113328e+07,  1.29803924e+07, -6.11172380e+05,  2.22165412e+06,
        2.91726593e+06,  

In [102]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score


print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("RMSE:", root_mean_squared_error(y_test, y_test_pred))
print("R2:", r2_score(y_test, y_test_pred))


MAE: 2155839.7715372145
RMSE: 7110976.429090736
R2: 0.16575571888396068
