# AN OVERVIEW OF THE DATASET

This dataset contains one record for every Starbucks or subsidiary store location that was in operation as of February 2017.

In [1]:
# Import the pandas and numpy libraries
import pandas as pd
import numpy as np

# Load the store directory from 'directory.csv' into the 'Starbucks' DataFrame
Starbucks = pd.read_csv(filepath_or_buffer='directory.csv')

# Preview the first 10 records to get a feel for the columns and values
Starbucks.head(n=10)


Unnamed: 0,Brand,Store Number,Store Name,Ownership Type,Street Address,City,State/Province,Country,Postcode,Phone Number,Timezone,Longitude,Latitude
0,Starbucks,47370-257954,"Meritxell, 96",Licensed,"Av. Meritxell, 96",Andorra la Vella,7,AD,AD500,376818720.0,GMT+1:00 Europe/Andorra,1.53,42.51
1,Starbucks,22331-212325,Ajman Drive Thru,Licensed,"1 Street 69, Al Jarf",Ajman,AJ,AE,,,GMT+04:00 Asia/Dubai,55.47,25.42
2,Starbucks,47089-256771,Dana Mall,Licensed,Sheikh Khalifa Bin Zayed St.,Ajman,AJ,AE,,,GMT+04:00 Asia/Dubai,55.47,25.39
3,Starbucks,22126-218024,Twofour 54,Licensed,Al Salam Street,Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.38,24.48
4,Starbucks,17127-178586,Al Ain Tower,Licensed,"Khaldiya Area, Abu Dhabi Island",Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.54,24.51
5,Starbucks,17688-182164,"Dalma Mall, Ground Floor",Licensed,"Dalma Mall, Mussafah",Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.49,24.4
6,Starbucks,18182-182165,"Dalma Mall, Level 1",Licensed,"Dalma Mall, Mussafah",Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.49,24.4
7,Starbucks,23359-229184,Debenhams Yas Mall,Licensed,Yas Island,Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.61,24.46
8,Starbucks,30781-99022,Khalidiya Street,Licensed,Khalidiya St.,Abu Dhabi,AZ,AE,,26670052.0,GMT+04:00 Asia/Muscat,55.69,24.19
9,Starbucks,20423-205465,Eastern Mangroves,Licensed,"Al Salam Street, The Mangroves",Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.38,24.48


### DESCRIPTIVE ANALYSIS

In [2]:
# Dimensions of the loaded data as a (row count, column count) tuple
(len(Starbucks.index), len(Starbucks.columns))

(25600, 13)

In [3]:
# Summarise the frame: the non-null count and data type of each column,
# plus the index range and memory usage
Starbucks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25600 entries, 0 to 25599
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Brand           25600 non-null  object 
 1   Store Number    25600 non-null  object 
 2   Store Name      25600 non-null  object 
 3   Ownership Type  25600 non-null  object 
 4   Street Address  25598 non-null  object 
 5   City            25585 non-null  object 
 6   State/Province  25600 non-null  object 
 7   Country         25600 non-null  object 
 8   Postcode        24078 non-null  object 
 9   Phone Number    18739 non-null  object 
 10  Timezone        25600 non-null  object 
 11  Longitude       25599 non-null  float64
 12  Latitude        25599 non-null  float64
dtypes: float64(2), object(11)
memory usage: 2.5+ MB


# DATA WRANGLING

### HANDLING MISSING VALUES

In [4]:
# Count the missing entries in every column of the 'Starbucks' DataFrame
Starbucks.isna().sum()

Brand                0
Store Number         0
Store Name           0
Ownership Type       0
Street Address       2
City                15
State/Province       0
Country              0
Postcode          1522
Phone Number      6861
Timezone             0
Longitude            1
Latitude             1
dtype: int64

Some columns contain missing values. Given the nature of this dataset, the analysis cannot be carried out on rows where key columns (such as Latitude, Longitude and City) are missing,
so we drop the rows in which any of those columns is missing.

In [5]:
# Drop every row that is missing any of the location fields the analysis
# depends on ('Latitude', 'Longitude', 'City').
# The filtered frame is assigned back to 'Starbucks' instead of using
# inplace=True, so the transformation is explicit and the cell gives the
# same result when re-run.
Starbucks = Starbucks.dropna(subset=['Latitude', 'Longitude', 'City'])


In [6]:
# Re-count missing entries per column to confirm that 'Latitude',
# 'Longitude' and 'City' no longer contain any
Starbucks.isna().sum()

Brand                0
Store Number         0
Store Name           0
Ownership Type       0
Street Address       2
City                 0
State/Province       0
Country              0
Postcode          1507
Phone Number      6860
Timezone             0
Longitude            0
Latitude             0
dtype: int64

### HANDLING DUPLICATE VALUES

In [7]:
# Show every row that is an exact repeat of an earlier row
# (the first occurrence of each row is not flagged)
Starbucks.loc[Starbucks.duplicated(keep='first')]

Unnamed: 0,Brand,Store Number,Store Name,Ownership Type,Street Address,City,State/Province,Country,Postcode,Phone Number,Timezone,Longitude,Latitude


Note: for the purpose of this analysis, some columns (like 'Store Number') should not contain duplicate values.

In [8]:
# Show rows whose 'Store Number' repeats one seen in an earlier row
Starbucks.loc[Starbucks.duplicated(subset='Store Number')]



Unnamed: 0,Brand,Store Number,Store Name,Ownership Type,Street Address,City,State/Province,Country,Postcode,Phone Number,Timezone,Longitude,Latitude


### HANDLING INCONSISTENT DATA VALUES

Check the relevant columns, whose values must be free of spelling errors, stray spaces and other inconsistencies.

In [9]:
# List the distinct values of the 'Brand' column, in order of first appearance
pd.unique(Starbucks['Brand'])


array(['Starbucks', 'Teavana', 'Evolution Fresh', 'Coffee House Holdings'],
      dtype=object)

In [10]:
# List the distinct values of the 'Ownership Type' column, in order of first appearance
pd.unique(Starbucks['Ownership Type'])

array(['Licensed', 'Joint Venture', 'Company Owned', 'Franchise'],
      dtype=object)

In [11]:
# List the distinct values of the 'Country' column, in order of first appearance
pd.unique(Starbucks['Country'])

array(['AD', 'AE', 'AR', 'AT', 'AU', 'AW', 'AZ', 'BE', 'BG', 'BH', 'BN',
       'BO', 'BR', 'BS', 'CA', 'CH', 'CL', 'CN', 'CO', 'CR', 'CW', 'CY',
       'CZ', 'DE', 'DK', 'EG', 'ES', 'FI', 'FR', 'GB', 'GR', 'GT', 'HU',
       'ID', 'IE', 'IN', 'JO', 'JP', 'KH', 'KR', 'KW', 'KZ', 'LB', 'LU',
       'MA', 'MC', 'MX', 'MY', 'NL', 'NO', 'NZ', 'OM', 'PA', 'PE', 'PH',
       'PL', 'PR', 'PT', 'QA', 'RO', 'RU', 'SA', 'SE', 'SG', 'SK', 'SV',
       'TH', 'TR', 'TT', 'TW', 'US', 'VN', 'ZA'], dtype=object)

In [12]:
# Dimensions of the data after cleaning, as a (row count, column count) tuple
(len(Starbucks.index), len(Starbucks.columns))

(25584, 13)

### Saving the cleaned dataset

In [13]:
# Write the 'Starbucks' DataFrame to 'Starbucks_locations(CLEANED).csv'.
# index=False keeps the row index out of the output file.
Starbucks.to_csv(path_or_buf='Starbucks_locations(CLEANED).csv', index=False)