In [1]:

# Load in some packages
import calendar
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Read the electronics sales workbook into a DataFrame and display it
electron_sales_df = pd.read_excel(
    r"C:\Users\jki\Downloads\electronic sales\electronic sales.xlsx"
)
electron_sales_df


Unnamed: 0,Order ID,Product,Quantity,Price,Date,Address
0,176558.0,USB-C Charging Cable,2.0,12.0,2019-04-19,"917 1st St, Dallas, TX 75001"
1,176559.0,Bose SoundSport Headphones,1.0,100.0,2019-04-07,"682 Chestnut St, Boston, MA 02215"
2,176560.0,Google Phone,1.0,600.0,2019-04-12,"669 Spruce St, Los Angeles, CA 90001"
3,176560.0,Wired Headphones,1.0,12.0,2019-04-12,"669 Spruce St, Los Angeles, CA 90001"
4,176561.0,Wired Headphones,1.0,12.0,2019-04-30,"333 8th St, Los Angeles, CA 90001"
...,...,...,...,...,...,...
65530,219872.0,Wired Headphones,1.0,12.0,2019-06-27,"846 9th St, Seattle, WA 98101"
65531,219873.0,AAA Batteries (4-pack),1.0,3.0,2019-06-18,"838 12th St, Los Angeles, CA 90001"
65532,219874.0,USB-C Charging Cable,1.0,12.0,2019-06-13,"925 Chestnut St, Atlanta, GA 30301"
65533,219875.0,Wired Headphones,1.0,12.0,2019-06-13,"692 9th St, Boston, MA 02215"


In [2]:
# Tally the NaN cells in each column to see how much data is missing
missing_values = electron_sales_df.isnull().sum()
print(missing_values)

Order ID    323
Product     323
Quantity    323
Price       323
Date        323
Address     323
dtype: int64


In [14]:
# Remove every row that lacks a value in any of the six source columns.
# One dropna over the full column list keeps exactly the rows that six
# separate per-column passes would keep, and reassigning the result avoids
# mutating the frame in place. Row labels are not renumbered, so the index
# keeps gaps where rows were dropped.
REQUIRED_COLUMNS = ['Order ID', 'Product', 'Quantity', 'Price', 'Date', 'Address']
electron_sales_df = electron_sales_df.dropna(subset=REQUIRED_COLUMNS)

# Re-count missing values to confirm that none remain
missing_values = electron_sales_df.isna().sum()
print(missing_values)

Order ID    0
Product     0
Quantity    0
Price       0
Date        0
Address     0
dtype: int64


In [4]:
# Parse the 'Date' column into pandas datetime values, then print a
# per-column summary of non-null counts and dtypes
electron_sales_df['Date'] = electron_sales_df['Date'].pipe(pd.to_datetime)
electron_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65535 entries, 0 to 65534
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Order ID  65212 non-null  float64       
 1   Product   65212 non-null  object        
 2   Quantity  65212 non-null  float64       
 3   Price     65212 non-null  float64       
 4   Date      65212 non-null  datetime64[ns]
 5   Address   65212 non-null  object        
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 3.0+ MB


In [6]:
# Make sure 'Quantity' and 'Price' are numeric so arithmetic on them works.
# pd.to_numeric picks the resulting dtype from the data; the summary recorded
# for this cell reports float64 for both columns.
for numeric_col in ('Quantity', 'Price'):
    electron_sales_df[numeric_col] = pd.to_numeric(electron_sales_df[numeric_col])

electron_sales_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65535 entries, 0 to 65534
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Order ID  65212 non-null  float64       
 1   Product   65212 non-null  object        
 2   Quantity  65212 non-null  float64       
 3   Price     65212 non-null  float64       
 4   Date      65212 non-null  datetime64[ns]
 5   Address   65212 non-null  object        
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 3.0+ MB


In [7]:
# Derive a new 'Month' column: the month number taken from each 'Date' value
month_number = electron_sales_df['Date'].dt.month
electron_sales_df['Month'] = month_number
electron_sales_df['Month'].describe()


count    65212.000000
mean         4.215696
std          2.229499
min          1.000000
25%          2.000000
50%          4.000000
75%          6.000000
max         12.000000
Name: Month, dtype: float64

In [8]:
# Add a 'Sales' column: the row-wise product of 'Quantity' and 'Price'
electron_sales_df['Sales'] = electron_sales_df['Quantity'].mul(electron_sales_df['Price'])
electron_sales_df['Sales'].describe()

count    65212.000000
mean       185.812933
std        331.170547
min          3.000000
25%         12.000000
50%         15.000000
75%        150.000000
max       3400.000000
Name: Sales, dtype: float64

In [11]:
# Let's add a City column

def get_city(address):
    """Return the city part of a comma-separated address.

    The city is taken to be the second comma-separated field, e.g.
    "917 1st St, Dallas, TX 75001" -> "Dallas".

    Parameters
    ----------
    address : object
        The address to parse. Anything that is not a ``str`` (for example a
        NaN from a missing cell) is treated as "no address".

    Returns
    -------
    str or None
        The second field with surrounding whitespace removed, or None when
        ``address`` is not a string or has no second field.
    """
    if not isinstance(address, str):
        return None

    fields = address.split(",")
    # A string without any comma has no city field; report "unknown" the same
    # way as for non-string input instead of raising IndexError.
    if len(fields) < 2:
        return None
    return fields[1].strip()

def get_state(address):
    """Return the state code from a comma-separated address.

    The state is taken to be the first whitespace-separated token of the
    third comma-separated field, e.g. "917 1st St, Dallas, TX 75001" -> "TX".

    Parameters
    ----------
    address : object
        The address to parse. Anything that is not a ``str`` (for example a
        NaN from a missing cell) is treated as "no address".

    Returns
    -------
    str or None
        The state token, or None when ``address`` is not a string, has fewer
        than three comma-separated fields, or the third field is blank.
    """
    if not isinstance(address, str):
        return None

    fields = address.split(",")
    if len(fields) < 3:
        return None

    # split() with no argument ignores leading and repeated whitespace, so the
    # state is found whether the field reads " TX 75001", "TX 75001" or
    # "  TX 75001". Splitting on a single space and taking index 1 returned
    # the ZIP code or an empty string for the latter two forms.
    tokens = fields[2].split()
    return tokens[0] if tokens else None

# Build a "City (ST)" label for each order from its address, e.g. "Dallas (TX)".
# The state is kept in the label because it is part of what identifies a city.
# Exactly one space separates the city from the bracketed state so the stored
# value matches what the rendered table shows.
electron_sales_df['City'] = electron_sales_df['Address'].apply(
    lambda address: f"{get_city(address)} ({get_state(address)})"
)
electron_sales_df.head()

Unnamed: 0,Order ID,Product,Quantity,Price,Date,Address,Month,Sales,City
0,176558.0,USB-C Charging Cable,2.0,12.0,2019-04-19,"917 1st St, Dallas, TX 75001",4.0,24.0,Dallas (TX)
1,176559.0,Bose SoundSport Headphones,1.0,100.0,2019-04-07,"682 Chestnut St, Boston, MA 02215",4.0,100.0,Boston (MA)
2,176560.0,Google Phone,1.0,600.0,2019-04-12,"669 Spruce St, Los Angeles, CA 90001",4.0,600.0,Los Angeles (CA)
3,176560.0,Wired Headphones,1.0,12.0,2019-04-12,"669 Spruce St, Los Angeles, CA 90001",4.0,12.0,Los Angeles (CA)
4,176561.0,Wired Headphones,1.0,12.0,2019-04-30,"333 8th St, Los Angeles, CA 90001",4.0,12.0,Los Angeles (CA)


In [13]:
# Pull the hour component out of each 'Date' value into an 'Hour' column.
# NOTE(review): the summary recorded for this cell is all zeros, which
# suggests 'Date' holds calendar dates without a time of day. Confirm that
# the source data actually carries order times before analysing by hour.
order_hour = electron_sales_df['Date'].dt.hour
electron_sales_df['Hour'] = order_hour
electron_sales_df['Hour'].describe()

count    65212.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: Hour, dtype: float64