# Machine Learning Model

In [1]:
# Initial imports
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Load the ventas_linea.csv dataset.
ventas_df = pd.read_csv('./FinalProject/almacenes_ventas.csv')
ventas_df

Unnamed: 0,product_id,description,date,name,qty,price_unit
0,26078,XIAOMI REDMI-NOTE-11S 6/128 NGO,2023-02-24,ALLENDE (QRO),1,4568.110000
1,26579,MOTO G41 4/128 DORADO,2023-02-24,ALLENDE (QRO),1,3620.680000
2,26579,MOTO G41 4/128 DORADO,2023-02-24,ALLENDE (QRO),1,3619.827500
3,26585,MOTO G22 4/128 NEGRO,2023-02-24,PLAZA MAYOR (LEON),1,3878.450000
4,25565,MOTO G60S 6/128 AZUL,2023-02-24,MADERO (LEON),1,4481.896552
...,...,...,...,...,...,...
34097,26053,ZTE L8 1/32 NEGRO,2023-02-24,CORREGIDORA 3 (QRO),1,1033.620000
34098,31000,APPLE IPHONE-11 4/64 BLANCO KIT,2023-02-24,PLAZA PATIO (QRO),1,9481.900000
34099,26578,MOTO G41 4/128 NEGRO,2023-02-24,ALLENDE (QRO),1,3275.000000
34100,25533,MOTO E20 2/32 AZUL,2023-02-24,PLAZA PATIO (QRO),1,1723.280000


In [3]:
# Convert the date column to datetime and set it as the index
ventas_df['date'] = pd.to_datetime(ventas_df['date'])
ventas_df.set_index('date', inplace=True)

# Split the 'description' column into multiple columns
ventas_df[['brand', 'model', 'details']] = ventas_df['description'].str.split(' ', n=2, expand=True)

# Compute the total sales column
ventas_df['total_sales'] = ventas_df['qty'] * ventas_df['price_unit']

# Group the data frame by name and month, and sum the qty and total sales columns
final_ventas_df = ventas_df.groupby([pd.Grouper(freq='M'), 'name', 'brand']).agg({'qty': 'sum', 'price_unit':'mean', 'total_sales': 'sum'})

# Reset the index to make the month, name, and description columns
final_ventas_df = final_ventas_df.reset_index()

# Format the month column as month names
final_ventas_df['date'] = final_ventas_df['date'].dt.strftime('%B')

# Rename the columns
final_ventas_df = final_ventas_df.rename(columns={'date': 'month'})

# Set the display option to show numbers without scientific notation
pd.options.display.float_format = '{:,.2f}'.format

# Print the summary data frame
final_ventas_df = final_ventas_df[['name', 'brand', 'month', 'qty', 'price_unit']].sort_values('qty', ascending=False)
final_ventas_df

Unnamed: 0,name,brand,month,qty,price_unit
2547,ALLENDE (QRO),SAM,December,253,3575.06
2911,ALLENDE (QRO),SAM,January,237,3510.55
2909,ALLENDE (QRO),MOTO,January,208,2858.29
2745,LEON CENTRO (LEON),SAM,December,196,2987.60
2227,ALLENDE (QRO),SAM,November,190,3891.20
...,...,...,...,...,...
2442,PARQUE (CELAYA),NOKIA,November,1,430.17
874,PARQUE (CELAYA),HUAWEI,June,1,4137.07
2440,PARQUE (CELAYA),HUAWEI,November,1,4481.90
876,PARQUE (CELAYA),LANIX,June,1,1464.66


In [4]:
# Define a list of words that represent devices
accesories_words = ['ROUTER']

# Check which rows contain any of the accesories-related words
is_device = final_ventas_df['brand'].str.contains('|'.join(accesories_words))

# Invert the boolean values to select only the rows where no accesories-related words are present
final_ventas_devices_df = final_ventas_df[~is_device]
final_ventas_devices_df

Unnamed: 0,name,brand,month,qty,price_unit
2547,ALLENDE (QRO),SAM,December,253,3575.06
2911,ALLENDE (QRO),SAM,January,237,3510.55
2909,ALLENDE (QRO),MOTO,January,208,2858.29
2745,LEON CENTRO (LEON),SAM,December,196,2987.60
2227,ALLENDE (QRO),SAM,November,190,3891.20
...,...,...,...,...,...
2442,PARQUE (CELAYA),NOKIA,November,1,430.17
874,PARQUE (CELAYA),HUAWEI,June,1,4137.07
2440,PARQUE (CELAYA),HUAWEI,November,1,4481.90
876,PARQUE (CELAYA),LANIX,June,1,1464.66


In [5]:
final_ventas_devices_df = pd.get_dummies(final_ventas_devices_df, columns=['name', 'brand', 'month'])

# Creation of dataframe for the Data segmentation and the model training
trials_df = final_ventas_devices_df
trials_df = trials_df.reset_index(drop=True)
trials_df

Unnamed: 0,qty,price_unit,name_ALAMEDA (QRO),name_ALLENDE (CELAYA),name_ALLENDE (QRO),name_ANDADOR (CELAYA),name_ANTEA (QRO),name_BOULEVARES (QRO),name_CONSTITUYENTES (QRO),name_CORREGIDORA 2 (QRO),...,month_August,month_December,month_February,month_January,month_July,month_June,month_May,month_November,month_October,month_September
0,253,3575.06,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,237,3510.55,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,208,2858.29,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,196,2987.60,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,190,3891.20,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3461,1,430.17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3462,1,4137.07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3463,1,4481.90,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3464,1,1464.66,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Data segmentation

In [6]:
# splitting the DataFrame
X = trials_df.drop(columns='qty')
y = pd.DataFrame(trials_df['qty'])

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

## Linear Regression Model

In [8]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [9]:
# Model training 
model.fit(X_train, y_train)

LinearRegression()

In [10]:
# Model application 
y_pred = model.predict(X_test)
df = pd.DataFrame(y_pred)
df['y_pred'] = df
df['y_test'] = pd.DataFrame(y_test)
df = df.drop(columns=0)
df

Unnamed: 0,y_pred,y_test
0,17.69,23
1,28.29,22
2,4.12,4
3,20.51,5
4,-5.75,3
...,...,...
862,15.15,8
863,37.80,44
864,20.61,24
865,34.02,55


In [11]:
# Model evaluation using r2 Score
r2Score_LinearRegression = model.score(X_test, y_test) * 100
print(f'r2 Score for Linear Regression: {r2Score_LinearRegression}%')

r2 Score for Linear Regression: 47.46828161168527%


### Logistic Regression

In [12]:
# Logistic Regression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [37]:
from sklearn.linear_model import LogisticRegression
LogisticR = LogisticRegression(solver='newton-cg', random_state=1)
LogisticR

LogisticRegression(random_state=1, solver='newton-cg')

In [38]:
LogisticR.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(random_state=1, solver='newton-cg')

In [31]:
y_pred = LogisticR.predict(X_test)
print(y_pred.shape)
df_LogReg = pd.DataFrame(y_pred)
df_LogReg['y_pred'] = df_LogReg
df_LogReg['y_test'] = pd.DataFrame(y_test)
df_LogReg = df_LogReg.drop(columns=0)
df_LogReg

(867,)


Unnamed: 0,y_pred,y_test
0,1,23
1,1,22
2,1,4
3,1,5
4,1,3
...,...,...
862,1,8
863,1,44
864,1,24
865,1,55


In [32]:
# Model evaluation using r2 Score
r2Score_LogisticRegression = LogisticR.score(X_test,y_test) * 100
print(f'r2 Score for Linear Regression: {r2Score_LogisticRegression}%')

r2 Score for Linear Regression: 25.951557093425603%
