Machine Learning Project -
Step1 : Data Pre-Processing

In [69]:
#Importing numpy and pandas libraries for data manipulation
import numpy as np
import pandas as pd

In [70]:
#Location of the dataset
filepath = 'Dataset/sales.csv'

In [71]:
#Loading the data in a dataframe
df = pd.read_csv(filepath) 

In [72]:
# Display the first five rows of the dataset
df.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,,300,320,340
1,,350,360,380
2,four,400,420,440
3,five,450,460,480
4,five,500,520,540


In [73]:
# Display the shape of the dataset
df.shape    

(20, 4)

In [74]:
#Checking for null values in the dataset
df.isnull().sum()

rate                     2
sales_in_first_month     0
sales_in_second_month    0
sales_in_third_month     0
dtype: int64

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   rate                   18 non-null     object
 1   sales_in_first_month   20 non-null     int64 
 2   sales_in_second_month  20 non-null     int64 
 3   sales_in_third_month   20 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 772.0+ bytes


In [76]:
# mean of all numerical columns
df.mean(numeric_only=True)

sales_in_first_month     775.0
sales_in_second_month    790.0
sales_in_third_month     810.0
dtype: float64

In [77]:
# Filling missing values with mean of respective columns
df['rate'].fillna(0, inplace=True)  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rate'].fillna(0, inplace=True)


In [78]:
df.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,300,320,340
1,0,350,360,380
2,four,400,420,440
3,five,450,460,480
4,five,500,520,540


In [79]:
df['sales_in_third_month'] = df['sales_in_third_month'] + np.random.randint(-30, 30, size=len(df))

Step 2 - Feature selection

In [80]:
# Feature Selection
X = df.iloc[:, :3]
y = df.iloc[:, -1]

In [81]:
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,0,300,320
1,0,350,360
2,four,400,420
3,five,450,460
4,five,500,520


In [82]:
y.head()    

0    340
1    350
2    467
3    458
4    529
Name: sales_in_third_month, dtype: int64

Step 3 - Convert Words to Number

In [83]:
# Convert Words to Numbers

def convert_to_int(word):
    word_dict = {'one':1, 'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'seven':7, 'eight':8, 'nine':9, 'ten':10,'eleven':11, 'twelve':12,'zero':0,'0':0}
    return word_dict[word]


In [84]:
X['rate'] = X['rate'].apply(lambda x: convert_to_int(str(x)))

In [85]:
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,0,300,320
1,0,350,360
2,4,400,420
3,5,450,460
4,5,500,520


Step 4 - Concatenating X and y

In [86]:
# Concatenate
df1 = pd.concat([X, y], axis=1)
df1.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,300,320,340
1,0,350,360,350
2,4,400,420,467
3,5,450,460,458
4,5,500,520,529


In [87]:
df1.dtypes

rate                     int64
sales_in_first_month     int64
sales_in_second_month    int64
sales_in_third_month     int64
dtype: object

In [88]:
# Saving the cleaned data to a new CSV file
df1.to_csv('Dataset/cleaned_sales.csv')

Step 5 - Train Model and Check Accuracy

In [89]:
# Fitting the model
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X, y)
clf.score(X, y)

0.9968047626987854

Step 6 - Dump and test a Model

In [90]:
# saving the model
import pickle
pickle.dump(clf, open('model.pkl', 'wb'))

In [91]:
# making prediction
model = pickle.load(open('model.pkl', 'rb'))
print(model.predict([[4, 300, 500]]))

[666.30899339]


