In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

In [2]:
df = pd.read_csv('jollof_wars_rice_sales_messy.csv')

In [3]:
df.shape

(554, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Brand Name             554 non-null    object 
 1   Grain Type             554 non-null    object 
 2   Price per Bag (Naira)  511 non-null    float64
 3   City                   554 non-null    object 
 4   Country                554 non-null    object 
 5   Customer Rating        554 non-null    int64  
 6   Number of Reviews      554 non-null    int64  
 7   Month Sold             554 non-null    object 
 8   Units Sold             554 non-null    int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 39.1+ KB


In [5]:
df.head()

Unnamed: 0,Brand Name,Grain Type,Price per Bag (Naira),City,Country,Customer Rating,Number of Reviews,Month Sold,Units Sold
0,Tommy Tasty,basmati,12870.72,Kumasi,Ghana,2,137,December,969
1,Royal Stallion,long grain,9771.94,Accra,Ghana,2,298,September,230
2,Caprice,long grain,15698.12,Lagos,Nigeria,2,200,April,509
3,Caprice,basmati,12591.07,Kumasi,Ghana,3,257,April,364
4,Sunshine,basmati,15636.29,Kumasi,Ghana,2,219,August,351


In [6]:
df.duplicated().sum()

9

In [7]:
df.drop_duplicates(inplace= True)

In [8]:
df.duplicated().sum()

0

In [9]:
categorical_columns =  df.select_dtypes(include='object')

In [10]:
for column in categorical_columns:
    print(column + '\n') 
    print(df[column].unique())

Brand Name

['Tommy Tasty' 'Royal Stallion' 'Caprice' 'Sunshine' 'Golden Harvest'
 'Caprce' "King's Pride" 'Uncle Sam' 'Mama Gold' 'Mamma Gold' 'Uncl Sam'
 'Royal Stallon']
Grain Type

['basmati' 'long grain' 'local' 'Basmati' 'LONG GRAIN' 'LOCAL' 'BASMATI'
 'Long grain' 'Local']
City

['Kumasi' 'Accra' 'Lagos' 'Abuja' 'Kumaci' 'lagos' 'Acccra']
Country

['Ghana' 'Nigeria']
Month Sold

['December' 'September' 'April' 'August' 'January' 'October' 'March'
 'February' 'May' 'June' 'July' 'November']


In [11]:
for column in categorical_columns:
   
    df[column]= df[column].astype('str')

In [12]:
import re

for column in categorical_columns:
    df[column] = (
        df[column]
        
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)  # replace all whitespace with single space
        .str.strip()
    )


In [13]:
for column in categorical_columns:
    print(column + '\n') 
    print(df[column].unique())

Brand Name

['tommy tasty' 'royal stallion' 'caprice' 'sunshine' 'golden harvest'
 'caprce' "king's pride" 'uncle sam' 'mama gold' 'mamma gold' 'uncl sam'
 'royal stallon']
Grain Type

['basmati' 'long grain' 'local']
City

['kumasi' 'accra' 'lagos' 'abuja' 'kumaci' 'acccra']
Country

['ghana' 'nigeria']
Month Sold

['december' 'september' 'april' 'august' 'january' 'october' 'march'
 'february' 'may' 'june' 'july' 'november']


In [14]:
df['City'].replace({"kumaci":"kumasi","acccra":"accra"},inplace=True)
df['Brand Name'].replace({"tommy tasty":"tommytasty",
                         "royal stallion":"royalstallion"
                         ""},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['City'].replace({"kumaci":"kumasi","acccra":"accra"},inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Brand Name'].replace({"tommy tasty":"tommytasty",


In [15]:
for column in categorical_columns:
    print(column + '\n') 
    print(df[column].unique())

Brand Name

['tommytasty' 'royalstallion' 'caprice' 'sunshine' 'golden harvest'
 'caprce' "king's pride" 'uncle sam' 'mama gold' 'mamma gold' 'uncl sam'
 'royal stallon']
Grain Type

['basmati' 'long grain' 'local']
City

['kumasi' 'accra' 'lagos' 'abuja']
Country

['ghana' 'nigeria']
Month Sold

['december' 'september' 'april' 'august' 'january' 'october' 'march'
 'february' 'may' 'june' 'july' 'november']
