# Sales Data Analysis Using: ('sales_data.csv')
- George Garrelts and Craig Geter
- [Data Source](https://www.kaggle.com/datasets/aemyjutt/salesdata)
- [Raw Data File Download](sales_data.csv)


## Initalization/Data Cleaning: (RUN ALL 1ST)

### Initalizing Libraries and CSV Data File

In [82]:
#Libraries Imported for Analysis: Numpy, Pandas, Matplotlib and Seaborn
import numpy as np #Numerical Pyhton tool (Arrays,Mathmatical funcions to arrays, etc)
import pandas as pd #Data Manipulation tool (Loading CSV File,Filtering rows, Calculating Summary Statistics)
import matplotlib.pyplot as plt #Data Visualization tool (Visualize Trends)
import seaborn as sns # Data Visualization tool ( Distribution plots scatterplots with regression lines and confidence intervals)
from googletrans import Translator # Google translate library which translate columns

In [83]:
#Loads the CSV data file using Pandas into Dataframe (Variable df)

df = pd.read_csv('sales_data.csv')

### Reformatting Column Names

In [84]:
#Renaming Column Names to make neater

df.rename(columns={'catégorie': 'category'}, inplace=True) # renames Column from catégorie to Category
df.rename(columns={'Product_ean': 'product id'}, inplace=True) # Renames Product_ean (European Article Number) to Product ID for easier Comprehension
df.rename(columns={'turnover': 'revenue'}, inplace=True) # Renames the "turnover" column to "Revenue" for easier comprehension
df.rename(columns={'margin': 'profit margin'}, inplace=True) # Renames the "Margin" column to "Profit Margin"
df.columns = df.columns.str.lower().str.replace(' ', '_') # Replaces all spaces with undescores and makes all columns lowercase

In [85]:
# Changing Category to English 
# Category Column Has Words in different languages

df['category_english'] = df['category']
df.info()
df.head(3)
translator=Translator() # Translator Object
df['category_english'] = df['category'].apply(lambda x: translator.translate(x, src='auto', dest='en').text) #Translates the category and assigns it to english_category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   order_date        185950 non-null  object 
 1   order_id          185950 non-null  int64  
 2   product           185950 non-null  object 
 3   product_id        185950 non-null  float64
 4   category          185950 non-null  object 
 5   purchase_address  185950 non-null  object 
 6   quantity_ordered  185950 non-null  int64  
 7   price_each        185950 non-null  float64
 8   cost_price        185950 non-null  float64
 9   revenue           185950 non-null  float64
 10  profit_margin     185950 non-null  float64
 11  category_english  185950 non-null  object 
dtypes: float64(5), int64(2), object(5)
memory usage: 17.0+ MB


AttributeError: 'Translator' object has no attribute 'raise_Exception'

### Fixing Data Types

In [None]:
#Fixing Order Date Data Types
df['order_date'] = pd.to_datetime(df['order_date']) #converts order_date datatype to datetime64
df.info() # Double Checks Data type is correct
df.head(3) # Outputs first 3 lines to see if correct

In [None]:
# Fixing Product ID Data Type

df['product_id'] = df['product_id'].astype('int64') # Converts Product_Id to Integer64
df.info()  # Double Checks Data type is correct
df.head(3) # Outputs first 3 lines to see if correct


### Checking for Null Values

In [None]:
#Data Cleaning : Checking For Null Values

df.isnull().sum() # Searches for any missing (null) values

order_date          0
order_id            0
product             0
product_id          0
category            0
purchase_address    0
quantity_ordered    0
price_each          0
cost_price          0
revenue             0
profit_margin       0
dtype: int64

## Data Analysis

In [None]:

df.head(20) # See the first "20" of rows of the dataframe


Unnamed: 0,order_date,order_id,product,product_id,category,purchase_address,quantity_ordered,price_each,cost_price,revenue,profit_margin
0,2019-01-22 21:25:00,141234,iPhone,5638009000000.0,Vêtements,"944 Walnut St, Boston, MA 02215",1,700.0,231.0,700.0,469.0
1,2019-01-28 14:15:00,141235,Lightning Charging Cable,5563320000000.0,Alimentation,"185 Maple St, Portland, OR 97035",1,14.95,7.475,14.95,7.475
2,2019-01-17 13:33:00,141236,Wired Headphones,2113973000000.0,Vêtements,"538 Adams St, San Francisco, CA 94016",2,11.99,5.995,23.98,11.99
3,2019-01-05 20:33:00,141237,27in FHD Monitor,3069157000000.0,Sports,"738 10th St, Los Angeles, CA 90001",1,149.99,97.4935,149.99,52.4965
4,2019-01-25 11:59:00,141238,Wired Headphones,9692681000000.0,Électronique,"387 10th St, Austin, TX 73301",1,11.99,5.995,11.99,5.995
5,2019-01-29 20:22:00,141239,AAA Batteries (4-pack),2953869000000.0,Alimentation,"775 Willow St, San Francisco, CA 94016",1,2.99,1.495,2.99,1.495
6,2019-01-26 12:16:00,141240,27in 4K Gaming Monitor,5173671000000.0,Vêtements,"979 Park St, Los Angeles, CA 90001",1,389.99,128.6967,389.99,261.2933
7,2019-01-05 12:04:00,141241,USB-C Charging Cable,8051737000000.0,Vêtements,"181 6th St, San Francisco, CA 94016",1,11.95,5.975,11.95,5.975
8,2019-01-01 10:30:00,141242,Bose SoundSport Headphones,1508418000000.0,Électronique,"867 Willow St, Los Angeles, CA 90001",1,99.99,49.995,99.99,49.995
9,2019-01-22 21:20:00,141243,Apple Airpods Headphones,1386344000000.0,Électronique,"657 Johnson St, San Francisco, CA 94016",1,150.0,97.5,150.0,52.5


In [None]:
df.describe () # To get a statistical Summary of the dataframe

Unnamed: 0,order_date,order_id,product_id,quantity_ordered,price_each,cost_price,revenue,profit_margin
count,185950,185950.0,185950.0,185950.0,185950.0,185950.0,185950.0,185950.0
mean,2019-07-18 21:54:38.887550208,230417.569379,5509211000000.0,1.124383,184.399735,69.668583,185.490917,115.289422
min,2019-01-01 03:07:00,141234.0,1000083000000.0,1.0,2.99,1.495,2.99,1.495
25%,2019-04-16 21:05:15,185831.25,3254280000000.0,1.0,11.95,5.975,11.95,5.975
50%,2019-07-17 20:40:30,230367.5,5511235000000.0,1.0,14.95,7.475,14.95,7.475
75%,2019-10-26 08:14:00,275035.75,7765195000000.0,1.0,150.0,97.5,150.0,52.5
max,2020-01-01 05:13:00,319670.0,9999983000000.0,9.0,1700.0,561.0,3400.0,2278.0
std,,51512.73711,2598403000000.0,0.442793,332.73133,109.424191,332.919771,225.22719


In [None]:
# Data Types and Null Values

df.info() # Get Information about data types and null values

"""
# Float 64 (Decimal Numbers): 5 Columns
# Int64 (Whole Numbers): 2 Columns
# Object (Mix Data Type: Numbers and Text): 4 columns
# 11 Columns total 
"""

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   order_date        185950 non-null  datetime64[ns]
 1   order_id          185950 non-null  int64         
 2   product           185950 non-null  object        
 3   product_id        185950 non-null  float64       
 4   category          185950 non-null  object        
 5   purchase_address  185950 non-null  object        
 6   quantity_ordered  185950 non-null  int64         
 7   price_each        185950 non-null  float64       
 8   cost_price        185950 non-null  float64       
 9   revenue           185950 non-null  float64       
 10  profit_margin     185950 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(2), object(3)
memory usage: 15.6+ MB


'\n# Float 64 (Decimal Numbers): 5 Columns\n# Int64 (Whole Numbers): 2 Columns\n# Object (Mix Data Type: Numbers and Text): 4 columns\n# 11 Columns total \n'