Machine Learning Project -
Step1 : Data Pre-Processing

In [1]:
#Importing numpy and pandas libraries for data manipulation
import numpy as np
import pandas as pd

In [2]:
#Location of the dataset
filepath = 'Dataset/sales.csv'

In [3]:
#Loading the data in a dataframe
df = pd.read_csv(filepath) 

In [4]:
# Display the first five rows of the dataset
df.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,,2,500,300
1,,4,300,650
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350


In [5]:
# Display the shape of the dataset
df.shape    

(6, 4)

In [6]:
#Checking for null values in the dataset
df.isnull().sum()

rate                     2
sales_in_first_month     0
sales_in_second_month    0
sales_in_third_month     0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   rate                   4 non-null      object
 1   sales_in_first_month   6 non-null      int64 
 2   sales_in_second_month  6 non-null      int64 
 3   sales_in_third_month   6 non-null      int64 
dtypes: int64(3), object(1)
memory usage: 324.0+ bytes


In [8]:
# mean of all numerical columns
df.mean(numeric_only=True)

sales_in_first_month     367.666667
sales_in_second_month    295.000000
sales_in_third_month     508.333333
dtype: float64

In [9]:
# Filling missing values with mean of respective columns
df['rate'].fillna(0, inplace=True)  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rate'].fillna(0, inplace=True)


In [10]:
df.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,2,500,300
1,0,4,300,650
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350


Step 2 - Feature selection

In [11]:
# Feature Selection
X = df.iloc[:, :3]
y = df.iloc[:, -1]

In [12]:
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,0,2,500
1,0,4,300
2,four,600,200
3,nine,450,320
4,seven,600,250


In [13]:
y.head()    

0    300
1    650
2    400
3    650
4    350
Name: sales_in_third_month, dtype: int64

Step 3 - Convert Words to Number

In [14]:
# Convert Words to Numbers

def convert_to_int(word):
    word_dict = {'one':1, 'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'seven':7, 'eight':8, 'nine':9, 'ten':10,'eleven':11, 'twelve':12,'zero':0,'0':0}
    return word_dict[word]


In [15]:
X['rate'] = X['rate'].apply(lambda x: convert_to_int(str(x)))

In [16]:
X.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,0,2,500
1,0,4,300
2,4,600,200
3,9,450,320
4,7,600,250


Step 4 - Concatenating X and y

In [17]:
# Concatenate
df1 = pd.concat([X, y], axis=1)
df1.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,2,500,300
1,0,4,300,650
2,4,600,200,400
3,9,450,320,650
4,7,600,250,350


In [18]:
df1.dtypes

rate                     int64
sales_in_first_month     int64
sales_in_second_month    int64
sales_in_third_month     int64
dtype: object

In [19]:
# Saving the cleaned data to a new CSV file
df1.to_csv('Dataset/cleaned_sales.csv')

Step 5 - Train Model and Check Accuracy

In [20]:
# Fitting the model
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X, y)
clf.score(X, y)

0.6948637514051953

Step 6 - Dump and test a Model

In [21]:
# saving the model
import pickle
pickle.dump(clf, open('model.pkl', 'wb'))

In [23]:
# making prediction
model = pickle.load(open('model.pkl', 'rb'))
print(model.predict([[4, 300, 500]]))

[143.3072588]


