<h1>Notebook to clean the Tigers Data Set</h1>
<p>data/DTIFanData_2-21-25.csv will be cleaned</p>

<h3>Dependencies</h3>

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# import Helpers from Helpers.py which is in same directory
from Helpers import *

<h3>Load the dataset</h3>

In [63]:
path = 'data/DTIFanData_3-25-25.csv'
# Load the data
data = pd.read_csv(path)
# Display the first few rows of the dataframe
print(data.head())

  data = pd.read_csv(path)


   KeepFlag  GlobalKey  SeasonKey FanSinceDate  \
0         1         11       2025     5/1/2009   
1         1         12       2025    2/25/2009   
2         1         15       2025     8/5/2008   
3         1         17       2025    6/24/2007   
4         1         19       2025     8/5/2008   

                  FirstGameAttended  TotalGamesAttended  \
0         2023-08-05 Tampa Bay Rays                 4.0   
1  2024-06-25 Philadelphia Phillies                 1.0   
2     2013-06-27 Los Angeles Angels                 6.0   
3      2015-07-05 Toronto Blue Jays                 8.0   
4          2013-07-14 Texas Rangers                 1.0   

                    FirstGameBought                    LastGameBought  \
0         2023-08-05 Tampa Bay Rays  2024-06-26 Philadelphia Phillies   
1  2024-06-25 Philadelphia Phillies  2024-06-25 Philadelphia Phillies   
2       2021-06-08 Seattle Mariners       2021-06-08 Seattle Mariners   
3         2021-06-26 Houston Astros      2024-09-29 

In [64]:
# Function to check for type errors in columns
def check_type_errors(df):
    type_errors = {}
    for column in df.columns:
        try:
            df[column].astype(df[column].dtype)
        except ValueError as e:
            type_errors[column] = str(e)
    return type_errors

# Check for type errors in the data
type_errors = check_type_errors(data)
print("Columns with type errors:", type_errors)

Columns with type errors: {}


In [65]:
# Get the data types of all columns
column_types = data.dtypes
for column, dtype in column_types.items():
    print(f"{column}: {dtype}")

KeepFlag: int64
GlobalKey: int64
SeasonKey: int64
FanSinceDate: object
FirstGameAttended: object
TotalGamesAttended: float64
FirstGameBought: object
LastGameBought: object
TotalTicketsPurchased: float64
TotalLifetimeValue: float64
CurrentSeasonEmailActivities: float64
PreviousSeasonsEmailActivities: float64
STMFlagCurr: int64
TicketingFanType: object
EmailFanType: object
FullSeasonBuyer: float64
HalfSeasonBuyer: float64
QuarterSeasonBuyer: float64
MiniPlanBuyer: float64
IndividualGameBuyer: float64
City: object
State: object
PostalCd: object
Country: object
Gender: object
Education: object
Occupation: object
Age: float64
MaritalStatus: object
PresenceOfChildren: object
DwellingType: object
HouseholdIncome: object
NetWorth: object
PrimaryVehicleType: object
MSADescription: object
MailSuppresionFlg: float64
WorkingWomanFlg: float64
BankCardHolderFlg: float64
GasDepartmentRetailCardHolderFlg: float64
TravelEntertainmentCardHolderFlg: float64
CreditCardHolderUnknownTypeFlg: float64
Premium

<h3>Cleaning State Values</h3>
<p>Keep states, provinces, and territories in US and Canada</p>
<p>Other locations will be put in 'OTHERS' category

In [66]:
state_counts = data['State'].value_counts()
state_counts = state_counts.reset_index()
state_counts.columns = ['State', 'Count']
print(state_counts.to_string())

                            State   Count
0                              MI  201900
1                              OH   11889
2                              ON    7658
3                              FL    7014
4                        MICHIGAN    6508
5                              NY    4725
6                              CA    3829
7                              IL    3653
8                              IN    3225
9                              TX    2564
10                        FLORIDA    2228
11                             PA    1770
12                           OHIO    1755
13                       ILLINOIS    1356
14                             MA    1283
15                             NC    1272
16                             VA    1264
17                     CALIFORNIA    1211
18                             GA    1207
19                             AZ    1191
20                        ONTARIO    1145
21                             TN    1068
22                             NJ 

In [67]:
# Mapping dictionary for state normalization
state_mapping = {
    'MI': 'MICHIGAN',
    'OH': 'OHIO',
    'IL': 'ILLINOIS',
    'IN': 'INDIANA',
    'WI': 'WISCONSIN',
    'MN': 'MINNESOTA',
    'IA': 'IOWA',
    'KY': 'KENTUCKY',
    'MO': 'MISSOURI',
    'PA': 'PENNSYLVANIA',
    'NY': 'NEW YORK',
    'NJ': 'NEW JERSEY',
    'CT': 'CONNECTICUT',
    'MD': 'MARYLAND',
    'VA': 'VIRGINIA',
    'NC': 'NORTH CAROLINA',
    'SC': 'SOUTH CAROLINA',
    'GA': 'GEORGIA',
    'FL': 'FLORIDA',
    'TX': 'TEXAS',
    'CA': 'CALIFORNIA',
    'WA': 'WASHINGTON',
    'OR': 'OREGON',
    'CO': 'COLORADO',
    'AZ': 'ARIZONA',
    'NV': 'NEVADA',
    'UT': 'UTAH',
    'ID': 'IDAHO',
    'MT': 'MONTANA',
    'WY': 'WYOMING',
    'ND': 'NORTH DAKOTA',
    'SD': 'SOUTH DAKOTA',
    'NE': 'NEBRASKA',
    'KS': 'KANSAS',
    'LA': 'LOUISIANA',
    'AR': 'ARKANSAS',
    'MS': 'MISSISSIPPI',
    'AL': 'ALABAMA',
    'TN': 'TENNESSEE',
    'WV': 'WEST VIRGINIA',
    'DE': 'DELAWARE',
    'VT': 'VERMONT',
    'NH': 'NEW HAMPSHIRE',
    'ME': 'MAINE',
    'HI': 'HAWAII',
    'AK': 'ALASKA',
    'NM': 'NEW MEXICO',
    'OK': 'OKLAHOMA',
    'MA': 'MASSACHUSETTS',
    'RI': 'RHODE ISLAND',
    # Non-state territories and regions
    'PR': 'PUERTO RICO',
    'DC': 'DISTRICT OF COLUMBIA',
    'AS': 'AMERICAN SAMOA',
    'GU': 'GUAM',
    'MP': 'NORTHERN MARIANA ISLANDS',
    'VI': 'VIRGIN ISLANDS',
    'FM': 'FEDERATED STATES OF MICRONESIA',
    'MH': 'MARSHALL ISLANDS',
    # International regions (if applicable)
    'OT': 'ONTARIO',  # Example for Ontario, Canada
    'ON': 'ONTARIO',  # Example for Ontario, Canada
    'Ontario': 'ONTARIO',  # Example for Ontario, Canada
    'NS': 'NOVA SCOTIA',  # Example for Nova Scotia, Canada
    'QC': 'QUEBEC',  # Example for Quebec, Canada
    'BC': 'BRITISH COLUMBIA',  # Example for British Columbia, Canada
    'AB': 'ALBERTA',  # Example for Alberta, Canada
    'SK': 'SASKATCHEWAN',  # Example for Saskatchewan, Canada
    'NL': 'NEWFOUNDLAND AND LABRADOR',  # Example for Newfoundland and Labrador, Canada
    'YT': 'YUKON',  # Example for Yukon, Canada
    'NT': 'NORTHWEST TERRITORIES',  # Example for Northwest Territories, Canada
    'NU': 'NUNAVUT',  # Example for Nunavut, Canada
    'MB': 'MANITOBA',  # Example for Manitoba, Canada
    'PE': 'PRINCE EDWARD ISLAND',  # Example for Prince Edward Island, Canada
    'NB': 'NEW BRUNSWICK'  # Example for New Brunswick, Canada
}

# Function to normalize state values
def normalize_state(state):
    state = state.strip().upper()
    return state_mapping.get(state, state)

# Apply the normalization function to the 'State' column
data['State'] = data['State'].apply(lambda x: normalize_state(x) if pd.notnull(x) else x)

# Display the normalized state counts
normalized_state_counts = data['State'].value_counts().reset_index()
normalized_state_counts.columns = ['State', 'Count']
print(normalized_state_counts.to_string())

                            State   Count
0                        MICHIGAN  208408
1                            OHIO   13644
2                         FLORIDA    9242
3                         ONTARIO    8803
4                        NEW YORK    5411
5                      CALIFORNIA    5040
6                        ILLINOIS    5009
7                         INDIANA    3914
8                           TEXAS    3426
9                    PENNSYLVANIA    2277
10                       VIRGINIA    1841
11                 NORTH CAROLINA    1819
12                        ARIZONA    1676
13                        GEORGIA    1624
14                      TENNESSEE    1505
15                  MASSACHUSETTS    1468
16                     NEW JERSEY    1345
17                      WISCONSIN    1193
18                       COLORADO    1118
19                       MARYLAND    1015
20                 SOUTH CAROLINA     871
21                       KENTUCKY     865
22                       MISSOURI 

In [68]:
# Get the counts of each state
state_counts = data['State'].value_counts()

# Function to update state values based on their occurrences
def update_state(state):
    if state not in state_mapping.values():
        return 'OTHERS'
    return state

# Function to group all values that are null as NULL
def update_Nullstate(state):
    if pd.isnull(state):
        return 'NULL'
    return state

# Apply the update function to the 'State' column
data['State'] = data['State'].apply(lambda x: update_state(x) if pd.notnull(x) else x)
data['State'] = data['State'].apply(lambda x: update_Nullstate(x))

# Display the updated state counts
updated_state_counts = data['State'].value_counts().reset_index()
updated_state_counts.columns = ['State', 'Count']
print(updated_state_counts.to_string())

                        State   Count
0                    MICHIGAN  208408
1                        NULL  119835
2                        OHIO   13644
3                     FLORIDA    9242
4                     ONTARIO    8803
5                    NEW YORK    5411
6                  CALIFORNIA    5040
7                    ILLINOIS    5009
8                     INDIANA    3914
9                       TEXAS    3426
10               PENNSYLVANIA    2277
11                   VIRGINIA    1841
12             NORTH CAROLINA    1819
13                    ARIZONA    1676
14                    GEORGIA    1624
15                  TENNESSEE    1505
16              MASSACHUSETTS    1468
17                     OTHERS    1353
18                 NEW JERSEY    1345
19                  WISCONSIN    1193
20                   COLORADO    1118
21                   MARYLAND    1015
22             SOUTH CAROLINA     871
23                   KENTUCKY     865
24                   MISSOURI     861
25          

<h3>Redo sections that analysts did to keep global key present</h3>

<h5>Gender</h5>

In [69]:
#Will create three columns: isMale, isFemale, isNaN based on gender column
#Print values in Gender column
print(data['Gender'].value_counts())
#Create new columns
data['isMale'] = data['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
data['isFemale'] = data['Gender'].apply(lambda x: 1 if x == 'Female' else 0)
#isNan should have all that are not 'Male' or 'Female' or is Null
data['isNaN'] = data['Gender'].apply(lambda x: 1 if pd.isnull(x) else 0)
# Find about 10,000 more non NaN and not Male or Female and lump them into isNaN
non_male_female = data[(data['Gender'] != 'Male') & (data['Gender'] != 'Female') & (pd.notnull(data['Gender']))]
data.loc[non_male_female.index, 'isNaN'] = 1

print(data[['isMale', 'isFemale', 'isNaN']].sum())

# confirm no overlap between isMale, isFemale, and isNaN
# Confirm no overlap between isMale, isFemale, and isNaN
assert data[['isMale', 'isFemale', 'isNaN']].sum().sum() == data.shape[0]

# Ensure the sum of all the columns values equals the number of rows
assert data[['isMale', 'isFemale', 'isNaN']].sum(axis=1).eq(1).all()

Gender
Male         162691
Female        98350
<Unknown>     80088
Name: count, dtype: int64
isMale      162691
isFemale     98350
isNaN       152181
dtype: int64


In [70]:
# Drop the gender column
data.drop(columns=['Gender'], inplace=True)

<h5>Age Redo</h5>

In [71]:
#print columns list
column_types = data.dtypes
for column, dtype in column_types.items():
    print(f"{column}: {dtype}")

KeepFlag: int64
GlobalKey: int64
SeasonKey: int64
FanSinceDate: object
FirstGameAttended: object
TotalGamesAttended: float64
FirstGameBought: object
LastGameBought: object
TotalTicketsPurchased: float64
TotalLifetimeValue: float64
CurrentSeasonEmailActivities: float64
PreviousSeasonsEmailActivities: float64
STMFlagCurr: int64
TicketingFanType: object
EmailFanType: object
FullSeasonBuyer: float64
HalfSeasonBuyer: float64
QuarterSeasonBuyer: float64
MiniPlanBuyer: float64
IndividualGameBuyer: float64
City: object
State: object
PostalCd: object
Country: object
Education: object
Occupation: object
Age: float64
MaritalStatus: object
PresenceOfChildren: object
DwellingType: object
HouseholdIncome: object
NetWorth: object
PrimaryVehicleType: object
MSADescription: object
MailSuppresionFlg: float64
WorkingWomanFlg: float64
BankCardHolderFlg: float64
GasDepartmentRetailCardHolderFlg: float64
TravelEntertainmentCardHolderFlg: float64
CreditCardHolderUnknownTypeFlg: float64
PremiumCardHolderFlg: 

In [72]:
# print values of data['Age']
print(data['Age'].value_counts())

Age
42.0    13496
44.0    13473
40.0    12886
46.0    12883
52.0    12358
48.0    11952
50.0    11705
38.0    11634
54.0    10951
56.0    10365
36.0    10320
60.0     9902
58.0     9812
62.0     9453
64.0     8981
34.0     8903
32.0     8604
66.0     8177
30.0     7895
28.0     7516
68.0     6966
26.0     6790
24.0     6170
70.0     5882
22.0     5061
72.0     4617
74.0     3702
20.0     3313
76.0     2722
78.0     1553
18.0     1497
80.0     1284
82.0      747
84.0      557
86.0      399
99.0      291
88.0      266
90.0      222
92.0      190
94.0      134
96.0      125
98.0       83
Name: count, dtype: int64


In [73]:
# print how many have null values
print(data['Age'].isnull().sum())

149385


In [74]:
import pandas as pd
from sklearn.cluster import KMeans

# Assuming 'data' is your DataFrame
# Use k-means clustering to fill in missing values for the 'Age' column

# Get the indices of rows with missing 'Age' values
missingAgeIndices = data[data['Age'].isnull()].index

# Get the indices of rows with non-missing 'Age' values
nonMissingAgeIndices = data[data['Age'].notnull()].index

# Create a k-means model with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0)

# Fit the model on the non-missing 'Age' values
kmeans.fit(data.loc[nonMissingAgeIndices, ['Age']])

# Predict the clusters for all 'Age' values
ageClusters = kmeans.predict(data[['Age']].fillna(0))

# Get the cluster centers
clusterCenters = kmeans.cluster_centers_

# Create a DataFrame to map clusters to their centers
clusterMap = pd.DataFrame(clusterCenters, columns=['Age'])

# Assign the cluster center values to the missing 'Age' values
for idx in missingAgeIndices:
    cluster = ageClusters[idx]
    data.at[idx, 'Age'] = clusterMap.at[cluster, 'Age']

# Now 'data' has the missing 'Age' values filled in using k-means clustering

  super()._check_params_vs_input(X, default_n_init=10)


In [75]:
# print how many have null values
print(data['Age'].isnull().sum())

0


<h5>DONE!</h5>

<h5>Household Estimate Income</h5>

In [76]:
# call binConvert function from Helpers.py
data = binConvert(data, 'HouseholdIncome')

In [77]:
import importlib
import Helpers
importlib.reload(Helpers)

<module 'Helpers' from '/home/jbrinkm/IllitchML/Helpers.py'>

In [78]:
# validate the conversion
print(getConvertedValues(data, 'HouseholdIncome'))

Index(['HouseholdIncome_7500099999', 'HouseholdIncome_Unknown',
       'HouseholdIncome_5500059999', 'HouseholdIncome_6500074999',
       'HouseholdIncome_5000054999', 'HouseholdIncome_1500019999',
       'HouseholdIncome_3500039999', 'HouseholdIncome_100000149999',
       'HouseholdIncome_3000034999', 'HouseholdIncome_6000064999',
       'HouseholdIncome_2000024999', 'HouseholdIncome_4500049999',
       'HouseholdIncome_250000', 'HouseholdIncome_175000199999',
       'HouseholdIncome_2500029999', 'HouseholdIncome_4000044999',
       'HouseholdIncome_150000174999', 'HouseholdIncome_200000249999',
       'HouseholdIncome_Under10000', 'HouseholdIncome_1000014999'],
      dtype='object')


<h5>Net Worth</h5>

In [79]:
# call binConvert function from Helpers.py
data = binConvert(data, 'NetWorth')


In [80]:
# validate the conversion
print(getConvertedValues(data, 'NetWorth'))

Index(['NetWorth_Greaterthan1999999', 'NetWorth_Unknown',
       'NetWorth_500000999999', 'NetWorth_10000001999999',
       'NetWorth_5000099999', 'NetWorth_250000499999', 'NetWorth_100000249999',
       'NetWorth_1000024999', 'NetWorth_2500049999', 'NetWorth_Lessthan1',
       'NetWorth_14999', 'NetWorth_50009999'],
      dtype='object')


<h5>CurrentSeasonEmailActivities</h5>

In [81]:
print(data['CurrentSeasonEmailActivities'].unique())
print(data['CurrentSeasonEmailActivities'])

[ 15.  nan   1.  12.  20.   0.  39.  19.  41.  21.   4.  24.  11.  31.
  49.  26.   2.  48.   5.  29.  23.  33.  47.  50.  36.  13.   3. 101.
  46.   7.  45.  37.  38.  34.  10.  43.  16.  40.  51.  53.   8.  55.
  35.   9. 100. 191.  58.  28.  56.  27.  44.  18.  30.  25.   6. 103.
  52.  22.  54.  69.  64.  17.  14.  61.  96.  78.  42.  32.  71.  93.
  60. 104.  98.  92.  74. 102.  62.  99.  66.  59.  97.  95.  63.  73.
  65.  57.  89. 135.  94. 110.  82.  87. 137.  75.  80.  72.  70. 109.
 161.  91. 120.  76. 107.  68.  90. 105. 166.  77.  79.  67. 112.  88.
  86. 139. 117. 118. 106.  84. 115. 108. 140.  83. 134.  81. 178.  85.
 149. 127. 116. 143. 129. 122. 123. 111. 154. 138. 131. 124. 128. 126.
 175. 114. 130. 150. 141. 192. 132. 176. 142. 119. 136. 162. 180. 153.
 177. 184. 151. 198. 186. 144. 164. 113. 158. 160. 121. 133. 157. 188.
 173. 196. 146.]
0         15.0
1          NaN
2          1.0
3         12.0
4         20.0
          ... 
413217     NaN
413218     NaN
413219     

Doing nearest neighbor estimate for this

In [82]:
modData = nearestNeighborEstimate(data, ['CurrentSeasonEmailActivities'])
# assert no null values in the column
assert modData['CurrentSeasonEmailActivities'].isnull().sum() == 0
data = modData


In [83]:
print(data['FanSinceDate'].unique())
print(type(data['FanSinceDate']))
# convert to datetime
data['FanSinceDate'] = pd.to_datetime(data['FanSinceDate'], errors='coerce')
print(type(data['FanSinceDate']))
# check for null values
print(data['FanSinceDate'].isnull().sum())
allFanDates = data['FanSinceDate']
print (allFanDates)

['5/1/2009' '2/25/2009' '8/5/2008' ... '2/3/2025' '2/8/2025' '2/9/2025']
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
0
0        2009-05-01
1        2009-02-25
2        2008-08-05
3        2007-06-24
4        2008-08-05
            ...    
413217   2025-03-26
413218   2025-03-26
413219   2025-03-26
413220   2025-03-26
413221   2025-03-26
Name: FanSinceDate, Length: 413222, dtype: datetime64[ns]


In [84]:
#reload the Helpers module
import importlib
import Helpers
importlib.reload(Helpers)

<module 'Helpers' from '/home/jbrinkm/IllitchML/Helpers.py'>

In [85]:
data=convertDateToDays(data, 'FanSinceDate')

In [86]:
print(data['FanSinceDate_totalDays'])
# check for null values
print(data['FanSinceDate_totalDays'].isnull().sum())

0         5813
1         5878
2         6082
3         6490
4         6082
          ... 
413217       5
413218       5
413219       5
413220       5
413221       5
Name: FanSinceDate_totalDays, Length: 413222, dtype: int64
0


In [90]:
data = binConvert(data, 'GroupDescription')
# validate the conversion
print(getConvertedValues(data, 'GroupDescription'))

Index(['GroupDescription_affluenthouseholds', 'GroupDescription_Unknown',
       'GroupDescription_careeroriented', 'GroupDescription_topwealth',
       'GroupDescription_settlingdown', 'GroupDescription_livingwell',
       'GroupDescription_comfortablehouseholds',
       'GroupDescription_workinghouseholds', 'GroupDescription_solidprestige',
       'GroupDescription_largehouseholds', 'GroupDescription_busyhouseholds',
       'GroupDescription_thriftyandactive', 'GroupDescription_takinghold',
       'GroupDescription_socialconnectors', 'GroupDescription_bargainhunters',
       'GroupDescription_leisureseekers', 'GroupDescription_startingout',
       'GroupDescription_divergingpaths', 'GroupDescription_ruralmetromix',
       'GroupDescription_communityminded',
       'GroupDescription_comfortableindependence',
       'GroupDescription_workingandstudying'],
      dtype='object')


In [None]:
# HigherSpenderFlag change all null values to 0
data['HighSpenderFlg'].fillna(0, inplace=True)

# columns to change nulls_to_zero
nulls_to_zero = [
    'FullSeasonBuyer', 'HalfSeasonBuyer', 'QuarterSeasonBuyer',
    'MiniPlanBuyer', 'IndividualGameBuyer', 'FamilyEventFlg',
    'TravelEntertainmentCardHolderFlg', ]

# all columns ending in Flg should have nulls changed to 0
for col in data.columns:
    if col.endswith('Flg'):
        nulls_to_zero.append(col)
# change nulls to zero
for col in nulls_to_zero:
    data[col].fillna(0, inplace=True)

In [100]:
print(data[nulls_to_zero].isnull().sum())

FullSeasonBuyer                        0
HalfSeasonBuyer                        0
QuarterSeasonBuyer                     0
MiniPlanBuyer                          0
IndividualGameBuyer                    0
FamilyEventFlg                         0
TravelEntertainmentCardHolderFlg       0
MailSuppresionFlg                      0
WorkingWomanFlg                        0
BankCardHolderFlg                      0
GasDepartmentRetailCardHolderFlg       0
TravelEntertainmentCardHolderFlg       0
CreditCardHolderUnknownTypeFlg         0
PremiumCardHolderFlg                   0
UpscaleDepartmentStoreCardHolderFlg    0
MailOrderResponderFlg                  0
TruckOwnerFlg                          0
MotorcycleOwnerFlg                     0
RVOwnerFlg                             0
IntTheatrePerformingArtsFlg            0
IntArtsFlg                             0
IntDomesticTravelFlg                   0
IntHomeStereosFlg                      0
IntMusicDevicesFlg                     0
IntMusicAvidList

In [102]:
# pint out all columns with null values
null_columns = data.columns[data.isnull().any()].tolist()
print("Columns with null values:")
print(null_columns)
print(data[null_columns].isnull().sum())

Columns with null values:
['FirstGameAttended', 'TotalGamesAttended', 'FirstGameBought', 'LastGameBought', 'TotalTicketsPurchased', 'TotalLifetimeValue', 'PreviousSeasonsEmailActivities', 'City', 'PostalCd', 'Country', 'Education', 'Occupation', 'MaritalStatus', 'PresenceOfChildren', 'DwellingType', 'PrimaryVehicleType', 'MSADescription', 'DistanceToArena', 'NumberOfAdultsInHousehold', 'LengthOfResidence', 'DiscretionaryIncomeIndex', 'FinancialScore', 'NumberOfVehicles', 'Male18to24', 'Female18to24', 'Male25to34', 'Female25to34', 'Male35to44', 'Female35to44', 'Male45to54', 'Female45to54', 'Male55to65', 'Female55to64', 'Male65to75', 'Female65to74', 'Male75up', 'Female75up', 'ClientFirstSaleDateKey', 'TotalEvents', 'AvgTicketsPerEvent', 'TotalSpend', 'AvgSpendPerEvent', 'LowestTicketPrice', 'HighestTicketPrice', 'AvgDistanceTraveledLocal', 'SpendPerEventConcerts', 'SpendPerEventArts', 'SpendPerEventSports', 'SpendPerEventFamily', 'ClusterCode', 'ClusterDescription', 'GroupCode', 'ModelCo

In [104]:
# make sure flag columns are of type int
flag_columns = [col for col in data.columns if col.endswith('Flg')]
for col in flag_columns:
    data[col] = data[col].astype(int)
# check the types of all columns
column_types = data.dtypes
for column, dtype in column_types.items():
    print(f"{column}: {dtype}")

KeepFlag: int64
GlobalKey: int64
SeasonKey: int64
FirstGameAttended: object
TotalGamesAttended: float64
FirstGameBought: object
LastGameBought: object
TotalTicketsPurchased: float64
TotalLifetimeValue: float64
CurrentSeasonEmailActivities: float64
PreviousSeasonsEmailActivities: float64
STMFlagCurr: int64
TicketingFanType: object
EmailFanType: object
FullSeasonBuyer: float64
HalfSeasonBuyer: float64
QuarterSeasonBuyer: float64
MiniPlanBuyer: float64
IndividualGameBuyer: float64
City: object
State: object
PostalCd: object
Country: object
Education: object
Occupation: object
Age: float64
MaritalStatus: object
PresenceOfChildren: object
DwellingType: object
PrimaryVehicleType: object
MSADescription: object
MailSuppresionFlg: int64
WorkingWomanFlg: int64
BankCardHolderFlg: int64
GasDepartmentRetailCardHolderFlg: int64
TravelEntertainmentCardHolderFlg: int64
CreditCardHolderUnknownTypeFlg: int64
PremiumCardHolderFlg: int64
UpscaleDepartmentStoreCardHolderFlg: int64
MailOrderResponderFlg: in

In [None]:
# drop shit columns
shittyCols = ['ClusterDescription', 'ClusterCode', 'ClientWalkUpBuyerFlg',
              'PrimaryVehicleType', ]

In [None]:
# Check for NaN values in the 'ClusterCode' column and count them
na_count = data['ClusterCode'].isna().sum()

# Print the count of NaN values
print(f"Number of NaN values in 'ClusterCode': {na_count}")

# print unique values
print(data['ClusterCode'].unique())



Number of NaN values in 'ClusterCode': 72093
[ 1. nan 26.  3. 34. 23. 17. 38.  5. 12. 13. 37.  6.  0.  2. 57.  4.  8.
 14. 40. 30. 11. 19. 59. 44. 62. 51.  7. 16. 31. 10. 47. 43. 25.  9. 27.
 35. 20. 15. 22. 36. 33. 39. 24. 63. 55. 28. 21. 60. 42. 49. 48. 53. 46.
 52. 45. 50. 54. 32. 69. 18. 41. 56. 68. 64. 71. 66. 29. 65. 61. 67. 58.
 70. 72.]


<h1>Save work to TigersCleaned.csv</h1>

In [87]:
#save work to a new csv file
data.to_csv('data/TigersCleaned.csv', index=False)

<h5>Cluster Analysis w/5 clusters</h5>

In [88]:
# print all columns with NaN values
nan_columns = data.columns[data.isnull().any()]
# print the name of the columns with NaN values
for nanCol in nan_columns:
    print(nanCol)

FirstGameAttended
TotalGamesAttended
FirstGameBought
LastGameBought
TotalTicketsPurchased
TotalLifetimeValue
PreviousSeasonsEmailActivities
FullSeasonBuyer
HalfSeasonBuyer
QuarterSeasonBuyer
MiniPlanBuyer
IndividualGameBuyer
City
PostalCd
Country
Education
Occupation
MaritalStatus
PresenceOfChildren
DwellingType
PrimaryVehicleType
MSADescription
MailSuppresionFlg
WorkingWomanFlg
BankCardHolderFlg
GasDepartmentRetailCardHolderFlg
TravelEntertainmentCardHolderFlg
CreditCardHolderUnknownTypeFlg
PremiumCardHolderFlg
UpscaleDepartmentStoreCardHolderFlg
MailOrderResponderFlg
TruckOwnerFlg
MotorcycleOwnerFlg
RVOwnerFlg
IntTheatrePerformingArtsFlg
IntArtsFlg
IntDomesticTravelFlg
IntHomeStereosFlg
IntMusicDevicesFlg
IntMusicAvidListenerFlg
IntMusicCollectorFlg
IntMovieCollectorFlg
IntAutoRacingFlg
IntFootballFlg
IntBaseballFlg
IntBasketballFlg
IntHockeyFlg
IntContestsFlg
IntSportsFlg
IntMusicMoviesFlg
IntNascarFlg
IntUpscaleLivingFlg
DistanceToArena
NumberOfAdultsInHousehold
LengthOfResidence
D

In [89]:
import pandas as pd
from sklearn.mixture import GaussianMixture

# Assuming 'data' is your DataFrame

# Create a Gaussian Mixture Model with 5 components (clusters)
gmm = GaussianMixture(n_components=5, random_state=0)

# Fit the model on the entire dataset (excluding any non-numeric columns)
gmm.fit(data_clean.select_dtypes(include=[float, int]))

# Predict the clusters for all data points
data_clean['Cluster'] = gmm.predict(data_clean.select_dtypes(include=[float, int]))

# Get the cluster centers (means of the Gaussian components)
clusterCenters = gmm.means_

# Create a DataFrame to map clusters to their centers
clusterMap = pd.DataFrame(clusterCenters, columns=data_clean.select_dtypes(include=[float, int]).columns)

# Now 'data' has an additional 'Cluster' column indicating the cluster each data point belongs to

NameError: name 'data_clean' is not defined