# Exploratory Data Analysis

In [55]:
# Data-handling libraries.
import pandas as pd
import numpy as np

# Plotting libraries, followed by notebook-wide plotting defaults.
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
plt.style.use("ggplot")  # apply the "ggplot" style sheet to all figures
# NOTE(review): `figure` is not referenced in the cells visible here — confirm it is still needed.
from matplotlib.pyplot import figure
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
matplotlib.rcParams["figure.figsize"] = (12,8)  # default figure size as (width, height) in inches

In [2]:
# Load the cigarette/tobacco retailer permit export into `df`, then preview
# its first ten rows.
df = pd.read_csv(filepath_or_buffer="All_Cigarette_and_Tobacco_Retailers.csv")
df.head(n=10)

Unnamed: 0,Taxpayer Id,Taxpayer Name,Location Name,Address,City,State,Zip,Zip+4,County,Location Number,Permit Type,Permit Number,Permit Status,Permit Begin Date,Permit End Date
0,10106415143,"SHELLIE CURTIS, INC.",MIGALITOS SUPERMERCADO #3,3939 S POLK ST STE 323,DALLAS,TX,75224,4423.0,DALLAS,5,CIG/TOB RETAILER,22011140.0,ACTIVE,08/26/2022,
1,12011638892,"BAYWAY BUSINESS, INC.",R D FOOD MART,6114 DECKER DR,BAYTOWN,TX,77520,1444.0,HARRIS,1,E-CIG RETAILER,22000106.0,ACTIVE,01/01/2022,
2,10106415143,"SHELLIE CURTIS, INC.",MIGALITOS SUPERMERCADO #4,333 S SAINT AUGUSTINE DR,DALLAS,TX,75217,7488.0,DALLAS,6,CIG/TOB RETAILER,22011243.0,ACTIVE,08/26/2022,
3,12010833395,JTP LLC,PORTER'S,703 E FRONT ST,MIDLAND,TX,79701,4834.0,MIDLAND,9,CIG/TOB RETAILER,19000685.0,ACTIVE,03/18/2019,
4,10106576951,"SOFIAN, INC.",SDS FOOD MART,4924 WHITE SETTLEMENT RD,FORT WORTH,TX,76114,3919.0,TARRANT,1,CIG/TOB RETAILER,3004834.0,ACTIVE,01/01/2004,
5,12036882780,"AUSTIN PETROLEUM, INC.",BREAKTIME,3502 MANGUM RD,HOUSTON,TX,77092,7418.0,HARRIS,5,E-CIG RETAILER,21010591.0,INACTIVE,01/01/2022,11/02/2022
6,10106931867,STARCO OIL CO INC.,TOBACCO STOP,117 INTERSTATE 45 S,HUNTSVILLE,TX,77340,4243.0,WALKER,4,CIG/TOB RETAILER,12003229.0,ACTIVE,05/21/2012,
7,10106931867,STARCO OIL CO INC.,TOBACCO STOP,117 INTERSTATE 45 S,HUNTSVILLE,TX,77340,4243.0,WALKER,4,E-CIG RETAILER,21013363.0,ACTIVE,01/01/2022,
8,10105547839,"S S S ENTERPRISES, INC.",ON THE GO MART,8901 HOWARD DR,HOUSTON,TX,77017,5430.0,HARRIS,2,E-CIG RETAILER,22002640.0,ACTIVE,02/05/2022,
9,10106136012,"YAM ENTERPRISES, INC.",STOP N SHOP #2,1301 AVENUE I,ROSENBERG,TX,77471,3205.0,FORT BEND,2,CIG/TOB RETAILER,2003938.0,ACTIVE,07/01/2002,


In [3]:
# Inspect the dtype pandas inferred for each column when the CSV was loaded.

df.dtypes

Taxpayer Id            int64
Taxpayer Name         object
Location Name         object
Address               object
City                  object
State                 object
Zip                    int64
Zip+4                float64
County                object
Location Number        int64
Permit Type           object
Permit Number        float64
Permit Status         object
Permit Begin Date     object
Permit End Date       object
dtype: object

In [4]:
# Report the size of the frame as (number of rows, number of columns).

df.shape

(73601, 15)

In [5]:
# Count the missing (NaN) entries in every column.

df.isna().sum()

Taxpayer Id              0
Taxpayer Name            0
Location Name            1
Address                  0
City                     0
State                    0
Zip                      0
Zip+4                 7932
County                  37
Location Number          0
Permit Type              0
Permit Number          802
Permit Status            0
Permit Begin Date        0
Permit End Date      59506
dtype: int64

In [6]:
# Find the index label of the row whose location name is missing, so that the
# row can be inspected and its name possibly inferred from similar rows.

df.index[df["Location Name"].isna()]

Int64Index([66527], dtype='int64')

In [7]:
# Display the full record whose location name is missing.
# The row is selected with the null mask itself instead of a hard-coded row
# number: a positional lookup with a copied index label only matches while the
# index is the untouched default range, and would silently show a different
# row once any earlier row has been dropped.
df.loc[df["Location Name"].isna()].iloc[0]

Taxpayer Id               32083270028
Taxpayer Name        ADRIENNE VASQUEZ
Location Name                     NaN
Address                 2810 BOONE DR
City                       LAGO VISTA
State                              TX
Zip                             78645
Zip+4                            7201
County                         TRAVIS
Location Number                     1
Permit Type            E-CIG RETAILER
Permit Number                     NaN
Permit Status                INACTIVE
Permit Begin Date          08/31/2022
Permit End Date                   NaN
Name: 66527, dtype: object

In [8]:
# List every record located in LAGO VISTA, TRAVIS county, to see whether any of
# them can supply the missing location name.

df.loc[df["County"].eq("TRAVIS") & df["City"].eq("LAGO VISTA")]

Unnamed: 0,Taxpayer Id,Taxpayer Name,Location Name,Address,City,State,Zip,Zip+4,County,Location Number,Permit Type,Permit Number,Permit Status,Permit Begin Date,Permit End Date
5886,16111931362,"DOLGENCORP OF TEXAS, INC.",DOLLAR GENERAL # 10554,8017 BRONCO LN,LAGO VISTA,TX,78645,4528.0,TRAVIS,1223,CIG/TOB RETAILER,13001385.0,ACTIVE,01/15/2013,
10521,17422287924,CEFCO A PARTNERSHIP,CEFCO #36,20520 FM 1431,LAGO VISTA,TX,78645,4458.0,TRAVIS,97,CIG/TOB RETAILER,8002772.0,ACTIVE,07/01/2008,
10590,17422287924,CEFCO A PARTNERSHIP,CEFCO #36,20520 FM 1431,LAGO VISTA,TX,78645,4458.0,TRAVIS,97,E-CIG RETAILER,22006383.0,ACTIVE,05/01/2022,
12503,17429117819,"HILLSIDE GROCERY, INC.",HILLSIDE GROCERY,7501 LOHMANS FORD RD,LAGO VISTA,TX,78645,4741.0,TRAVIS,1,CIG/TOB RETAILER,99001645.0,ACTIVE,05/01/1999,
12504,17429117819,"HILLSIDE GROCERY, INC.",HILLSIDE GROCERY,7501 LOHMANS FORD RD,LAGO VISTA,TX,78645,4741.0,TRAVIS,1,E-CIG RETAILER,21007132.0,ACTIVE,01/01/2022,
16888,17515744815,"PAY AND SAVE, INC.",LOWE'S #161,7708 LOHMANS FORD RD,LAGO VISTA,TX,78645,4781.0,TRAVIS,190,CIG/TOB RETAILER,12000941.0,ACTIVE,01/18/2012,
18388,17526928399,"BROOKSHIRE BROTHERS, INC.",BROOKSHIRE BROTHERS #117,8087 BRONCO LN,LAGO VISTA,TX,78645,4528.0,TRAVIS,259,CIG/TOB RETAILER,23000045.0,ACTIVE,04/01/2023,
19058,17532312828,"POINT VENTURE RESTAURANT GROUP, INC.",THE GNARLY GAR,18200 LAKEPOINT CV,LAGO VISTA,TX,78645,8712.0,TRAVIS,1,CIG/TOB RETAILER,,INACTIVE,04/01/2007,05/31/2022
28305,32040270913,LAGO NAYAAB ENTERPRISES INC,SLR GROCERY,6004 LOHMANS FORD RD,LAGO VISTA,TX,78645,5283.0,TRAVIS,1,CIG/TOB RETAILER,9004261.0,ACTIVE,11/10/2009,
28306,32040270913,LAGO NAYAAB ENTERPRISES INC,SLR GROCERY,6004 LOHMANS FORD RD,LAGO VISTA,TX,78645,5283.0,TRAVIS,1,E-CIG RETAILER,21010761.0,ACTIVE,01/01/2022,


No other row shares enough properties with the row that is missing its location name (index 66527) for the location to be deduced, so that row will be dropped.

In [9]:
# Drop the row(s) whose location name is missing.
# The result is rebound to `df` rather than mutated with `inplace=True`, so the
# step reads as an explicit transformation; rows are the default axis for dropna.

df = df.dropna(subset=["Location Name"])

In [10]:
# Confirm the drop: the row count should be one lower than before, with the
# column count unchanged.

df.shape

(73600, 15)

In [11]:
# Convert "Permit Begin Date" and "Permit End Date" from text to datetime64[ns].
# The source values are written month/day/year (e.g. 08/26/2022), so the format
# is stated explicitly instead of being inferred: parsing cannot silently swap
# day and month, and a value in any other layout raises an error. Missing end
# dates become NaT.

date_columns = ["Permit Begin Date", "Permit End Date"]
df[date_columns] = df[date_columns].apply(pd.to_datetime, format="%m/%d/%Y")

In [12]:
# Confirm that both permit date columns now report a datetime64[ns] dtype.

df.dtypes

Taxpayer Id                   int64
Taxpayer Name                object
Location Name                object
Address                      object
City                         object
State                        object
Zip                           int64
Zip+4                       float64
County                       object
Location Number               int64
Permit Type                  object
Permit Number               float64
Permit Status                object
Permit Begin Date    datetime64[ns]
Permit End Date      datetime64[ns]
dtype: object

In [13]:
# Preview the first ten rows again to see the converted date columns.
df.head(n=10)

Unnamed: 0,Taxpayer Id,Taxpayer Name,Location Name,Address,City,State,Zip,Zip+4,County,Location Number,Permit Type,Permit Number,Permit Status,Permit Begin Date,Permit End Date
0,10106415143,"SHELLIE CURTIS, INC.",MIGALITOS SUPERMERCADO #3,3939 S POLK ST STE 323,DALLAS,TX,75224,4423.0,DALLAS,5,CIG/TOB RETAILER,22011140.0,ACTIVE,2022-08-26,NaT
1,12011638892,"BAYWAY BUSINESS, INC.",R D FOOD MART,6114 DECKER DR,BAYTOWN,TX,77520,1444.0,HARRIS,1,E-CIG RETAILER,22000106.0,ACTIVE,2022-01-01,NaT
2,10106415143,"SHELLIE CURTIS, INC.",MIGALITOS SUPERMERCADO #4,333 S SAINT AUGUSTINE DR,DALLAS,TX,75217,7488.0,DALLAS,6,CIG/TOB RETAILER,22011243.0,ACTIVE,2022-08-26,NaT
3,12010833395,JTP LLC,PORTER'S,703 E FRONT ST,MIDLAND,TX,79701,4834.0,MIDLAND,9,CIG/TOB RETAILER,19000685.0,ACTIVE,2019-03-18,NaT
4,10106576951,"SOFIAN, INC.",SDS FOOD MART,4924 WHITE SETTLEMENT RD,FORT WORTH,TX,76114,3919.0,TARRANT,1,CIG/TOB RETAILER,3004834.0,ACTIVE,2004-01-01,NaT
5,12036882780,"AUSTIN PETROLEUM, INC.",BREAKTIME,3502 MANGUM RD,HOUSTON,TX,77092,7418.0,HARRIS,5,E-CIG RETAILER,21010591.0,INACTIVE,2022-01-01,2022-11-02
6,10106931867,STARCO OIL CO INC.,TOBACCO STOP,117 INTERSTATE 45 S,HUNTSVILLE,TX,77340,4243.0,WALKER,4,CIG/TOB RETAILER,12003229.0,ACTIVE,2012-05-21,NaT
7,10106931867,STARCO OIL CO INC.,TOBACCO STOP,117 INTERSTATE 45 S,HUNTSVILLE,TX,77340,4243.0,WALKER,4,E-CIG RETAILER,21013363.0,ACTIVE,2022-01-01,NaT
8,10105547839,"S S S ENTERPRISES, INC.",ON THE GO MART,8901 HOWARD DR,HOUSTON,TX,77017,5430.0,HARRIS,2,E-CIG RETAILER,22002640.0,ACTIVE,2022-02-05,NaT
9,10106136012,"YAM ENTERPRISES, INC.",STOP N SHOP #2,1301 AVENUE I,ROSENBERG,TX,77471,3205.0,FORT BEND,2,CIG/TOB RETAILER,2003938.0,ACTIVE,2002-07-01,NaT


In [14]:
# Count how many permit records fall in each state (each row is one retailer
# permit record, not a purchase).

df["State"].value_counts()

TX    73563
CA        8
FL        5
NY        4
NV        2
VA        2
PA        2
NC        2
CO        2
NJ        1
MO        1
MD        1
SC        1
RI        1
AZ        1
KY        1
MN        1
WA        1
DE        1
Name: State, dtype: int64

In [15]:
# Keep only the Texas (TX) rows; Texas holds almost all of the permit records.

df = df.loc[df["State"].eq("TX")]

In [36]:
# Remove the cap on displayed rows so that long outputs are shown in full
# rather than truncated.

pd.options.display.max_rows = None

In [68]:
# Count the permit records for each city in the Texas subset.
# NOTE(review): the saved output lists only HOUSTON, which suggests this cell was
# last run after the later city/zip filters — confirm with a fresh top-to-bottom run.

df["City"].value_counts()

HOUSTON    284
Name: City, dtype: int64

In [18]:
# Narrow the data to records whose city is HOUSTON, then preview five rows.

df = df.loc[df["City"].eq("HOUSTON")]
df.head(n=5)

Unnamed: 0,Taxpayer Id,Taxpayer Name,Location Name,Address,City,State,Zip,Zip+4,County,Location Number,Permit Type,Permit Number,Permit Status,Permit Begin Date,Permit End Date
5,12036882780,"AUSTIN PETROLEUM, INC.",BREAKTIME,3502 MANGUM RD,HOUSTON,TX,77092,7418.0,HARRIS,5,E-CIG RETAILER,21010591.0,INACTIVE,2022-01-01,2022-11-02
8,10105547839,"S S S ENTERPRISES, INC.",ON THE GO MART,8901 HOWARD DR,HOUSTON,TX,77017,5430.0,HARRIS,2,E-CIG RETAILER,22002640.0,ACTIVE,2022-02-05,NaT
14,12624594912,"TEXAS PETROLEUM GROUP, LLC",TIMEWISE #839,8155 KATY FWY,HOUSTON,TX,77024,1909.0,HARRIS,195,CIG/TOB RETAILER,21012908.0,INACTIVE,2021-12-30,2022-05-23
61,10438331828,"ANRUS, INC.",MI TIENDA MEAT MARKET,10211 CLUB CREEK DR,HOUSTON,TX,77036,7105.0,HARRIS,1,CIG/TOB RETAILER,6001455.0,ACTIVE,2006-03-01,NaT
62,10305251034,"KUIFS PETROLEUM, L.P.",KS #32,8055 N SAM HOUSTON PKWY W STE A,HOUSTON,TX,77064,3456.0,HARRIS,34,E-CIG RETAILER,21010157.0,ACTIVE,2022-01-01,NaT


In [20]:
# Count the permit records for each county among the HOUSTON rows.

df["County"].value_counts()

HARRIS        10186
FORT BEND        84
MONTGOMERY        3
Name: County, dtype: int64

In [69]:
# Narrow the data to HARRIS county, the county with the most records, then
# preview five rows.

df = df.loc[df["County"].eq("HARRIS")]
df.head(n=5)

Unnamed: 0,Taxpayer Id,Taxpayer Name,Location Name,Address,City,State,Zip,Zip+4,County,Location Number,Permit Type,Permit Number,Permit Status,Permit Begin Date,Permit End Date
897,12017692026,ENLIGHT INVESTMENTS CORPORATION,SUPER K FOOD MART,4302 HICKORY DOWNS DR,HOUSTON,TX,77084,3516.0,HARRIS,1,CIG/TOB RETAILER,4005738.0,ACTIVE,2004-11-01,NaT
898,12017692026,ENLIGHT INVESTMENTS CORPORATION,SUPER K FOOD MART,4302 HICKORY DOWNS DR,HOUSTON,TX,77084,3516.0,HARRIS,1,E-CIG RETAILER,21008786.0,ACTIVE,2022-01-01,NaT
1465,12048616754,"SUPERMERCADO TELOLOAPAN # 8, INC.",TELOLOAPAN MEAT MARKET #8,5420 HIGHWAY 6 N,HOUSTON,TX,77084,1826.0,HARRIS,2,CIG/TOB RETAILER,8000323.0,ACTIVE,2007-11-01,NaT
1617,12055498534,"ZRN, LLC",GREENHOUSE FOOD MART,3207 GREENHOUSE RD,HOUSTON,TX,77084,4419.0,HARRIS,49,CIG/TOB RETAILER,21009719.0,ACTIVE,2021-12-15,NaT
1618,12055498534,"ZRN, LLC",GREENHOUSE FOOD MART,3207 GREENHOUSE RD,HOUSTON,TX,77084,4419.0,HARRIS,49,E-CIG RETAILER,22004360.0,ACTIVE,2022-02-16,NaT


In [31]:
# Count the permit records for each ZIP code to find the ZIP with the most.

df["Zip"].value_counts()

77084    284
77036    248
77015    222
77070    215
77077    208
        ... 
77339      2
77546      2
77598      1
77201      1
77449      1
Name: Zip, Length: 104, dtype: int64

In [33]:
# Narrow the data to the ZIP code with the highest number of permit records.
# The ZIP is derived from the data instead of being typed in from a previous
# output, so the filter keeps matching its stated intent if the source file changes.

busiest_zip = df["Zip"].value_counts().idxmax()
df = df[df["Zip"] == busiest_zip]
df.head(10)

Unnamed: 0,Taxpayer Id,Taxpayer Name,Location Name,Address,City,State,Zip,Zip+4,County,Location Number,Permit Type,Permit Number,Permit Status,Permit Begin Date,Permit End Date
897,12017692026,ENLIGHT INVESTMENTS CORPORATION,SUPER K FOOD MART,4302 HICKORY DOWNS DR,HOUSTON,TX,77084,3516.0,HARRIS,1,CIG/TOB RETAILER,4005738.0,ACTIVE,2004-11-01,NaT
898,12017692026,ENLIGHT INVESTMENTS CORPORATION,SUPER K FOOD MART,4302 HICKORY DOWNS DR,HOUSTON,TX,77084,3516.0,HARRIS,1,E-CIG RETAILER,21008786.0,ACTIVE,2022-01-01,NaT
1465,12048616754,"SUPERMERCADO TELOLOAPAN # 8, INC.",TELOLOAPAN MEAT MARKET #8,5420 HIGHWAY 6 N,HOUSTON,TX,77084,1826.0,HARRIS,2,CIG/TOB RETAILER,8000323.0,ACTIVE,2007-11-01,NaT
1617,12055498534,"ZRN, LLC",GREENHOUSE FOOD MART,3207 GREENHOUSE RD,HOUSTON,TX,77084,4419.0,HARRIS,49,CIG/TOB RETAILER,21009719.0,ACTIVE,2021-12-15,NaT
1618,12055498534,"ZRN, LLC",GREENHOUSE FOOD MART,3207 GREENHOUSE RD,HOUSTON,TX,77084,4419.0,HARRIS,49,E-CIG RETAILER,22004360.0,ACTIVE,2022-02-16,NaT
1641,12056256899,VALLIANI ENTERPRISES INC.,GREENHOUSE CITGO,3207 GREENHOUSE RD,HOUSTON,TX,77084,4419.0,HARRIS,1,CIG/TOB RETAILER,6005978.0,INACTIVE,2006-11-10,2022-01-31
2137,12603039020,"RUBEN FOODS, INC",GARCIA'S FOOD STORE,17550 W LITTLE YORK RD STE 1,HOUSTON,TX,77084,6321.0,HARRIS,1,CIG/TOB RETAILER,7002747.0,ACTIVE,2007-07-15,NaT
2181,12604310693,"ALCONOMY GROUP, INC.",ALCO LIQUOR,2919 BARKER CYPRESS RD STE K,HOUSTON,TX,77084,6869.0,HARRIS,2,CIG/TOB RETAILER,13002823.0,ACTIVE,2013-05-01,NaT
2519,12624559725,"MALIK & SONS, INC.",ONE STOP - DEERFIELD / CHAPIN TIRES,18323 CLAY RD,HOUSTON,TX,77084,3915.0,HARRIS,1,CIG/TOB RETAILER,8002774.0,INACTIVE,2008-07-01,2021-05-15
2693,12624594912,"TEXAS PETROLEUM GROUP, LLC",TIMEWISE #829,4255 HIGHWAY 6 N,HOUSTON,TX,77084,5411.0,HARRIS,185,CIG/TOB RETAILER,21012903.0,INACTIVE,2021-12-30,2022-05-23


In [59]:
# For each address, count how many distinct taxpayers (unique "Taxpayer Id"
# values) hold records there. Despite the variable's name, the grouping key is
# the address alone.

Grouped_Zip_and_Address = (
    df.groupby(by=["Address"])["Taxpayer Id"]
    .nunique()
)
Grouped_Zip_and_Address.head(n=10)

Address
1206 BARKER CYPRESS RD             1
1219 HIGHWAY 6 N                   2
1430 BARKER CYPRESS RD             1
14838 PARK ROW DR STE D            1
1520 BARKER CYPRESS RD             2
15410 W LITTLE YORK RD             1
15410 W LITTLE YORK RD STE A       1
1550 FRY RD                        1
15852 YORKTOWN CROSSING PKWY       1
16212 LOCH KATRINE LN STE AANDB    1
Name: Taxpayer Id, dtype: int64

In [61]:
# Count the permit records registered at each address, most frequent first.

df["Address"].value_counts()

1893 BARKER CYPRESS RD             6
4521 HIGHWAY 6 N STE F             5
18311 CLAY RD STE B2               5
4255 HIGHWAY 6 N                   5
18324 CLAY RD                      5
2919 BARKER CYPRESS RD STE E       4
17102 W LITTLE YORK RD             4
6102 HIGHWAY 6 N                   4
6495 BARKER CYPRESS RD STE A       4
5920 HIGHWAY 6 N STE 1002          4
1219 HIGHWAY 6 N                   4
4351 HIGHWAY 6 N                   4
1635 BARKER CYPRESS RD             3
4820 HIGHWAY 6 N                   3
18323 CLAY RD                      3
6172 HIGHWAY 6 N                   3
17111 KIETH HARROW BLVD STE A      3
5210 BARKER CYPRESS RD             3
16503 CLAY RD STE A                3
3207 GREENHOUSE RD                 3
17930 W LITTLE YORK RD             3
2112 FRY RD STE B                  3
5418 BARKER CYPRESS RD             3
4011 HIGHWAY 6 N                   3
5803 BARKER CYPRESS RD             3
17904 W LITTLE YORK RD STE E       3
17702 KIETH HARROW BLVD            3
1

In [52]:
# Keep only the addresses that have more than three permit records.
# The per-address counts are computed once and then filtered through a callable
# indexer, instead of running value_counts() twice on the same column.

Address_most_retail_purchase = (
    df["Address"].value_counts().loc[lambda counts: counts > 3]
)

Address_most_retail_purchase

1893 BARKER CYPRESS RD          6
4521 HIGHWAY 6 N STE F          5
18311 CLAY RD STE B2            5
4255 HIGHWAY 6 N                5
18324 CLAY RD                   5
2919 BARKER CYPRESS RD STE E    4
17102 W LITTLE YORK RD          4
6102 HIGHWAY 6 N                4
6495 BARKER CYPRESS RD STE A    4
5920 HIGHWAY 6 N STE 1002       4
1219 HIGHWAY 6 N                4
4351 HIGHWAY 6 N                4
Name: Address, dtype: int64