<h1>Notebook to clean the Tigers Data Set</h1>
<p>data/DTIFanData_2-21-25.csv will be cleaned</p>

<h3>Dependencies</h3>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# import Helpers from Helpers.py which is in same directory
from Helpers import *

<h3>Load the dataset</h3>

In [3]:
path = 'data/DRWFanData_2-21-25.csv'
# Load the data
data = pd.read_csv(path)
# Display the first few rows of the dataframe
print(data.head())

  data = pd.read_csv(path)


   KeepFlag  GlobalKey  SeasonKey FanSinceDate  \
0         1          9       2024   2021-10-18   
1         1         15       2024   2017-11-25   
2         1         17       2024   2015-12-04   
3         1         22       2024   2022-02-17   
4         1         25       2024   2008-02-25   

                  FirstGameAttended  TotalGamesAttended  \
0  2021-10-19 Columbus Blue Jackets                23.0   
1      2017-11-25 New Jersey Devils                 6.0   
2         2021-11-27 Buffalo Sabres                 4.0   
3         2022-03-10 Minnesota Wild                 1.0   
4        2024-10-27 Edmonton Oilers                 1.0   

                FirstGameBought                  LastGameBought  \
0                           NaN                             NaN   
1  2017-11-25 New Jersey Devils        2017-12-13 Boston Bruins   
2     2024-04-07 Buffalo Sabres  2024-12-18 Philadelphia Flyers   
3                           NaN                             NaN   
4    2024

In [4]:
# Function to check for type errors in columns
def check_type_errors(df):
    type_errors = {}
    for column in df.columns:
        try:
            df[column].astype(df[column].dtype)
        except ValueError as e:
            type_errors[column] = str(e)
    return type_errors

# Check for type errors in the data
type_errors = check_type_errors(data)
print("Columns with type errors:", type_errors)

Columns with type errors: {}


In [5]:
# Get the data types of all columns
column_types = data.dtypes
for column, dtype in column_types.items():
    print(f"{column}: {dtype}")

KeepFlag: int64
GlobalKey: int64
SeasonKey: int64
FanSinceDate: object
FirstGameAttended: object
TotalGamesAttended: float64
FirstGameBought: object
LastGameBought: object
TotalTicketsPurchased: float64
TotalLifetimeValue: float64
CurrentSeasonEmailActivities: float64
PreviousSeasonsEmailActivities: float64
STMFlagCurr: int64
TicketingFanType: object
EmailFanType: object
FullSeasonBuyer: float64
HalfSeasonBuyer: float64
QuarterSeasonBuyer: float64
MiniPlanBuyer: float64
IndividualGameBuyer: float64
City: object
State: object
PostalCd: object
Country: object
Gender: object
Education: object
Occupation: object
Age: float64
MaritalStatus: object
PresenceOfChildren: object
DwellingType: object
HouseholdIncome: object
NetWorth: object
PrimaryVehicleType: object
MSADescription: object
MailSuppresionFlg: float64
WorkingWomanFlg: float64
BankCardHolderFlg: float64
GasDepartmentRetailCardHolderFlg: float64
TravelEntertainmentCardHolderFlg: float64
CreditCardHolderUnknownTypeFlg: float64
Premium

<h3>Cleaning State Values</h3>
<p>Keep states, provinces, and territories in US and Canada</p>
<p>Other locations will be put in 'OTHERS' category

In [6]:
state_counts = data['State'].value_counts()
state_counts = state_counts.reset_index()
state_counts.columns = ['State', 'Count']
print(state_counts.to_string())

         State  Count
0           MI  73389
1           ON   7463
2           OH   3118
3           IL   1077
4           IN    960
5        MICHI    859
6           CA    845
7           FL    822
8           NY    642
9           TX    562
10          PA    388
11          WI    386
12          NC    351
13          VA    343
14          CO    343
15          TN    308
16          GA    275
17          MA    262
18          WA    237
19          AZ    227
20          KY    211
21          MN    190
22          NJ    184
23          MD    178
24       Ontar    161
25          MO    156
26          SC    150
27          AB    150
28          QC    134
29          NV    130
30          Mi    114
31          IA    108
32          AL     98
33          CT     96
34          OR     94
35          NL     92
36          BC     87
37          AK     69
38          NM     68
39          NE     64
40          UT     60
41          KS     57
42          MT     53
43          OK     52
44        

In [7]:
# Mapping dictionary for state normalization
state_mapping = {
    'MI': 'MICHIGAN',
    'OH': 'OHIO',
    'IL': 'ILLINOIS',
    'IN': 'INDIANA',
    'WI': 'WISCONSIN',
    'MN': 'MINNESOTA',
    'IA': 'IOWA',
    'KY': 'KENTUCKY',
    'MO': 'MISSOURI',
    'PA': 'PENNSYLVANIA',
    'NY': 'NEW YORK',
    'NJ': 'NEW JERSEY',
    'CT': 'CONNECTICUT',
    'MD': 'MARYLAND',
    'VA': 'VIRGINIA',
    'NC': 'NORTH CAROLINA',
    'SC': 'SOUTH CAROLINA',
    'GA': 'GEORGIA',
    'FL': 'FLORIDA',
    'TX': 'TEXAS',
    'CA': 'CALIFORNIA',
    'WA': 'WASHINGTON',
    'OR': 'OREGON',
    'CO': 'COLORADO',
    'AZ': 'ARIZONA',
    'NV': 'NEVADA',
    'UT': 'UTAH',
    'ID': 'IDAHO',
    'MT': 'MONTANA',
    'WY': 'WYOMING',
    'ND': 'NORTH DAKOTA',
    'SD': 'SOUTH DAKOTA',
    'NE': 'NEBRASKA',
    'KS': 'KANSAS',
    'LA': 'LOUISIANA',
    'AR': 'ARKANSAS',
    'MS': 'MISSISSIPPI',
    'AL': 'ALABAMA',
    'TN': 'TENNESSEE',
    'WV': 'WEST VIRGINIA',
    'DE': 'DELAWARE',
    'VT': 'VERMONT',
    'NH': 'NEW HAMPSHIRE',
    'ME': 'MAINE',
    'HI': 'HAWAII',
    'AK': 'ALASKA',
    'NM': 'NEW MEXICO',
    'OK': 'OKLAHOMA',
    'MA': 'MASSACHUSETTS',
    'RI': 'RHODE ISLAND',
    # Non-state territories and regions
    'PR': 'PUERTO RICO',
    'DC': 'DISTRICT OF COLUMBIA',
    'AS': 'AMERICAN SAMOA',
    'GU': 'GUAM',
    'MP': 'NORTHERN MARIANA ISLANDS',
    'VI': 'VIRGIN ISLANDS',
    'FM': 'FEDERATED STATES OF MICRONESIA',
    'MH': 'MARSHALL ISLANDS',
    # International regions (if applicable)
    'OT': 'ONTARIO',  # Example for Ontario, Canada
    'ON': 'ONTARIO',  # Example for Ontario, Canada
    'Ontario': 'ONTARIO',  # Example for Ontario, Canada
    'NS': 'NOVA SCOTIA',  # Example for Nova Scotia, Canada
    'QC': 'QUEBEC',  # Example for Quebec, Canada
    'BC': 'BRITISH COLUMBIA',  # Example for British Columbia, Canada
    'AB': 'ALBERTA',  # Example for Alberta, Canada
    'SK': 'SASKATCHEWAN',  # Example for Saskatchewan, Canada
    'NL': 'NEWFOUNDLAND AND LABRADOR',  # Example for Newfoundland and Labrador, Canada
    'YT': 'YUKON',  # Example for Yukon, Canada
    'NT': 'NORTHWEST TERRITORIES',  # Example for Northwest Territories, Canada
    'NU': 'NUNAVUT',  # Example for Nunavut, Canada
    'MB': 'MANITOBA',  # Example for Manitoba, Canada
    'PE': 'PRINCE EDWARD ISLAND',  # Example for Prince Edward Island, Canada
    'NB': 'NEW BRUNSWICK'  # Example for New Brunswick, Canada
}

# Function to normalize state values
def normalize_state(state):
    state = state.strip().upper()
    return state_mapping.get(state, state)

# Apply the normalization function to the 'State' column
data['State'] = data['State'].apply(lambda x: normalize_state(x) if pd.notnull(x) else x)

# Display the normalized state counts
normalized_state_counts = data['State'].value_counts().reset_index()
normalized_state_counts.columns = ['State', 'Count']
print(normalized_state_counts.to_string())

                              State  Count
0                          MICHIGAN  73524
1                           ONTARIO   7481
2                              OHIO   3174
3                          ILLINOIS   1081
4                           INDIANA    961
5                             MICHI    886
6                        CALIFORNIA    848
7                           FLORIDA    828
8                          NEW YORK    642
9                             TEXAS    566
10                     PENNSYLVANIA    388
11                        WISCONSIN    388
12                   NORTH CAROLINA    351
13                         COLORADO    345
14                         VIRGINIA    343
15                        TENNESSEE    308
16                          GEORGIA    275
17                    MASSACHUSETTS    262
18                       WASHINGTON    237
19                          ARIZONA    228
20                         KENTUCKY    212
21                        MINNESOTA    190
22         

In [8]:
# Get the counts of each state
state_counts = data['State'].value_counts()

# Function to update state values based on their occurrences
def update_state(state):
    if state not in state_mapping.values():
        return 'OTHERS'
    return state

# Function to group all values that are null as NULL
def update_Nullstate(state):
    if pd.isnull(state):
        return 'NULL'
    return state

# Apply the update function to the 'State' column
data['State'] = data['State'].apply(lambda x: update_state(x) if pd.notnull(x) else x)
data['State'] = data['State'].apply(lambda x: update_Nullstate(x))

# Display the updated state counts
updated_state_counts = data['State'].value_counts().reset_index()
updated_state_counts.columns = ['State', 'Count']
print(updated_state_counts.to_string())

                             State   Count
0                             NULL  412524
1                         MICHIGAN   73524
2                          ONTARIO    7481
3                             OHIO    3174
4                           OTHERS    1329
5                         ILLINOIS    1081
6                          INDIANA     961
7                       CALIFORNIA     848
8                          FLORIDA     828
9                         NEW YORK     642
10                           TEXAS     566
11                    PENNSYLVANIA     388
12                       WISCONSIN     388
13                  NORTH CAROLINA     351
14                        COLORADO     345
15                        VIRGINIA     343
16                       TENNESSEE     308
17                         GEORGIA     275
18                   MASSACHUSETTS     262
19                      WASHINGTON     237
20                         ARIZONA     228
21                        KENTUCKY     212
22         

<h3>Redo sections that analysts did to keep global key present</h3>

<h5>Gender</h5>

In [9]:
#Will create three columns: isMale, isFemale, isNaN based on gender column
#Print values in Gender column
print(data['Gender'].value_counts())
#Create new columns
data['isMale'] = data['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
data['isFemale'] = data['Gender'].apply(lambda x: 1 if x == 'Female' else 0)
#isNan should have all that are not 'Male' or 'Female' or is Null
data['isNaN'] = data['Gender'].apply(lambda x: 1 if pd.isnull(x) else 0)
# Find about 10,000 more non NaN and not Male or Female and lump them into isNaN
non_male_female = data[(data['Gender'] != 'Male') & (data['Gender'] != 'Female') & (pd.notnull(data['Gender']))]
data.loc[non_male_female.index, 'isNaN'] = 1

print(data[['isMale', 'isFemale', 'isNaN']].sum())

# confirm no overlap between isMale, isFemale, and isNaN
# Confirm no overlap between isMale, isFemale, and isNaN
assert data[['isMale', 'isFemale', 'isNaN']].sum().sum() == data.shape[0]

# Ensure the sum of all the columns values equals the number of rows
assert data[['isMale', 'isFemale', 'isNaN']].sum(axis=1).eq(1).all()

Gender
<Unknown>    177536
Male         148787
Female        97605
Name: count, dtype: int64
isMale      148787
isFemale     97605
isNaN       262799
dtype: int64


In [10]:
# Drop the gender column
data.drop(columns=['Gender'], inplace=True)

<h5>Age Redo</h5>

In [11]:
#print columns list
column_types = data.dtypes
for column, dtype in column_types.items():
    print(f"{column}: {dtype}")

KeepFlag: int64
GlobalKey: int64
SeasonKey: int64
FanSinceDate: object
FirstGameAttended: object
TotalGamesAttended: float64
FirstGameBought: object
LastGameBought: object
TotalTicketsPurchased: float64
TotalLifetimeValue: float64
CurrentSeasonEmailActivities: float64
PreviousSeasonsEmailActivities: float64
STMFlagCurr: int64
TicketingFanType: object
EmailFanType: object
FullSeasonBuyer: float64
HalfSeasonBuyer: float64
QuarterSeasonBuyer: float64
MiniPlanBuyer: float64
IndividualGameBuyer: float64
City: object
State: object
PostalCd: object
Country: object
Education: object
Occupation: object
Age: float64
MaritalStatus: object
PresenceOfChildren: object
DwellingType: object
HouseholdIncome: object
NetWorth: object
PrimaryVehicleType: object
MSADescription: object
MailSuppresionFlg: float64
WorkingWomanFlg: float64
BankCardHolderFlg: float64
GasDepartmentRetailCardHolderFlg: float64
TravelEntertainmentCardHolderFlg: float64
CreditCardHolderUnknownTypeFlg: float64
PremiumCardHolderFlg: 

In [12]:
# print values of data['Age']
print(data['Age'].value_counts())

Age
30.0    13183
32.0    13019
28.0    12996
26.0    12040
34.0    11793
36.0    11397
38.0    10879
40.0    10500
52.0    10393
42.0    10249
24.0    10060
54.0     9788
50.0     9630
44.0     9499
48.0     9277
46.0     9237
56.0     9046
58.0     8682
60.0     7919
22.0     7574
62.0     7265
64.0     6063
66.0     4893
20.0     4278
68.0     3917
70.0     3047
72.0     2252
18.0     1811
74.0     1782
76.0     1316
78.0      881
80.0      702
82.0      461
84.0      357
86.0      257
99.0      213
88.0      197
90.0      173
92.0      134
94.0      128
96.0      100
98.0       65
Name: count, dtype: int64


In [13]:
# print how many have null values
print(data['Age'].isnull().sum())

261738


In [14]:
import pandas as pd
from sklearn.cluster import KMeans

# Assuming 'data' is your DataFrame
# Use k-means clustering to fill in missing values for the 'Age' column

# Get the indices of rows with missing 'Age' values
missingAgeIndices = data[data['Age'].isnull()].index

# Get the indices of rows with non-missing 'Age' values
nonMissingAgeIndices = data[data['Age'].notnull()].index

# Create a k-means model with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0)

# Fit the model on the non-missing 'Age' values
kmeans.fit(data.loc[nonMissingAgeIndices, ['Age']])

# Predict the clusters for all 'Age' values
ageClusters = kmeans.predict(data[['Age']].fillna(0))

# Get the cluster centers
clusterCenters = kmeans.cluster_centers_

# Create a DataFrame to map clusters to their centers
clusterMap = pd.DataFrame(clusterCenters, columns=['Age'])

# Assign the cluster center values to the missing 'Age' values
for idx in missingAgeIndices:
    cluster = ageClusters[idx]
    data.at[idx, 'Age'] = clusterMap.at[cluster, 'Age']

# Now 'data' has the missing 'Age' values filled in using k-means clustering

  super()._check_params_vs_input(X, default_n_init=10)


In [15]:
# print how many have null values
print(data['Age'].isnull().sum())

0


<h5>DONE!</h5>

<h5>Household Estimate Income</h5>

In [16]:
# call binConvert function from Helpers.py
data = binConvert(data, 'HouseholdIncome')

In [17]:
import importlib
import Helpers
importlib.reload(Helpers)

<module 'Helpers' from '/home/jbrinkm/IllitchML/Helpers.py'>

In [18]:
# validate the conversion
print(getConvertedValues(data, 'HouseholdIncome'))

Index(['HouseholdIncome_250000', 'HouseholdIncome_Unknown',
       'HouseholdIncome_7500099999', 'HouseholdIncome_6500074999',
       'HouseholdIncome_100000149999', 'HouseholdIncome_5500059999',
       'HouseholdIncome_175000199999', 'HouseholdIncome_3500039999',
       'HouseholdIncome_2500029999', 'HouseholdIncome_4500049999',
       'HouseholdIncome_1500019999', 'HouseholdIncome_2000024999',
       'HouseholdIncome_6000064999', 'HouseholdIncome_5000054999',
       'HouseholdIncome_200000249999', 'HouseholdIncome_150000174999',
       'HouseholdIncome_3000034999', 'HouseholdIncome_4000044999',
       'HouseholdIncome_1000014999', 'HouseholdIncome_Under10000'],
      dtype='object')


<h5>Net Worth</h5>

In [19]:
# call binConvert function from Helpers.py
data = binConvert(data, 'NetWorth')


In [20]:
# validate the conversion
print(getConvertedValues(data, 'NetWorth'))

Index(['NetWorth_250000499999', 'NetWorth_Unknown', 'NetWorth_5000099999',
       'NetWorth_10000001999999', 'NetWorth_500000999999',
       'NetWorth_Greaterthan1999999', 'NetWorth_100000249999',
       'NetWorth_1000024999', 'NetWorth_50009999', 'NetWorth_2500049999',
       'NetWorth_Lessthan1', 'NetWorth_14999'],
      dtype='object')


<h5>CurrentSeasonEmailActivities</h5>

In [21]:
print(data['CurrentSeasonEmailActivities'].unique())
print(data['CurrentSeasonEmailActivities'])

[1.00e+00 8.00e+00 3.62e+02 7.00e+00      nan 4.00e+00 2.00e+00 5.50e+01
 2.30e+01 6.00e+00 0.00e+00 9.00e+00 3.00e+00 1.05e+02 1.08e+02 1.10e+01
 2.80e+01 2.90e+01 9.50e+01 6.00e+01 1.20e+01 2.10e+01 1.50e+01 1.80e+01
 4.40e+01 7.50e+01 1.40e+01 3.40e+01 6.40e+01 4.20e+01 1.00e+01 6.70e+01
 2.40e+01 4.70e+01 1.47e+02 5.00e+00 3.60e+01 7.70e+01 2.75e+02 4.00e+01
 1.30e+01 3.30e+01 1.79e+02 1.10e+02 1.50e+02 2.50e+01 2.00e+01 1.29e+02
 3.50e+01 7.00e+01 4.80e+01 2.20e+02 2.20e+01 2.70e+01 1.63e+02 1.90e+01
 3.00e+01 2.07e+02 8.50e+01 1.27e+02 1.59e+02 6.20e+01 2.60e+01 1.00e+02
 4.90e+01 1.60e+01 5.00e+01 8.20e+01 5.60e+01 3.90e+01 1.93e+02 3.20e+01
 6.30e+01 1.62e+02 4.60e+01 6.50e+01 1.55e+02 1.04e+02 3.70e+01 2.34e+02
 1.23e+02 1.39e+02 5.20e+01 1.98e+02 6.10e+01 2.57e+02 1.41e+02 1.88e+02
 1.95e+02 5.30e+01 4.10e+01 1.70e+02 1.31e+02 3.10e+01 1.70e+01 2.12e+02
 8.30e+01 8.90e+01 1.77e+02 2.22e+02 1.78e+02 1.30e+02 1.84e+02 2.60e+02
 2.94e+02 9.10e+01 6.90e+01 1.14e+02 4.30e+01 2.37e

Doing nearest neighbor estimate for this

In [22]:
modData = nearestNeighborEstimate(data, ['CurrentSeasonEmailActivities'])
# assert no null values in the column
assert modData['CurrentSeasonEmailActivities'].isnull().sum() == 0
data = modData


KeyboardInterrupt: 

In [23]:
print(data['FanSinceDate'].unique())
print(type(data['FanSinceDate']))
# convert to datetime
data['FanSinceDate'] = pd.to_datetime(data['FanSinceDate'], errors='coerce')
print(type(data['FanSinceDate']))
# check for null values
print(data['FanSinceDate'].isnull().sum())
allFanDates = data['FanSinceDate']
print (allFanDates)

['2021-10-18' '2017-11-25' '2015-12-04' ... '2024-05-24' '2024-05-30'
 '2024-06-08']
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
0
0        2021-10-18
1        2017-11-25
2        2015-12-04
3        2022-02-17
4        2008-02-25
            ...    
509186   2025-02-20
509187   2025-02-20
509188   2025-02-20
509189   2025-02-20
509190   2025-02-20
Name: FanSinceDate, Length: 509191, dtype: datetime64[ns]


In [None]:
#reload the Helpers module
import importlib
import Helpers
importlib.reload(Helpers)

<module 'Helpers' from '/home/jbrinkm/IllitchML/Helpers.py'>

In [24]:
data=convertDateToDays(data, 'FanSinceDate')

In [25]:
print(data['FanSinceDate_totalDays'])
# check for null values
print(data['FanSinceDate_totalDays'].isnull().sum())

0         1260
1         2683
2         3405
3         1138
4         6244
          ... 
509186      39
509187      39
509188      39
509189      39
509190      39
Name: FanSinceDate_totalDays, Length: 509191, dtype: int64
0


In [26]:
data = binConvert(data, 'GroupDescription')
# validate the conversion
print(getConvertedValues(data, 'GroupDescription'))

Index(['GroupDescription_affluenthouseholds', 'GroupDescription_Unknown',
       'GroupDescription_comfortablehouseholds', 'GroupDescription_topwealth',
       'GroupDescription_takinghold', 'GroupDescription_solidprestige',
       'GroupDescription_largehouseholds', 'GroupDescription_communityminded',
       'GroupDescription_ruralmetromix', 'GroupDescription_startingout',
       'GroupDescription_divergingpaths', 'GroupDescription_thriftyandactive',
       'GroupDescription_livingwell', 'GroupDescription_leisureseekers',
       'GroupDescription_bargainhunters', 'GroupDescription_workinghouseholds',
       'GroupDescription_careeroriented',
       'GroupDescription_comfortableindependence',
       'GroupDescription_busyhouseholds', 'GroupDescription_settlingdown',
       'GroupDescription_workingandstudying',
       'GroupDescription_socialconnectors'],
      dtype='object')


In [30]:
# HigherSpenderFlag change all null values to 0
data['HighSpenderFlg'].fillna(0, inplace=True)

# columns to change nulls_to_zero
nulls_to_zero = [
    'FullSeasonBuyer', 'HalfSeasonBuyer', 'QuarterSeasonBuyer',
    'MiniPlanBuyer', 'IndividualGameBuyer', 'FamilyEventFlg',
    'TravelEntertainmentCardHolderFlg', ]

# all columns ending in Flg should have nulls changed to 0
for col in data.columns:
    if col.endswith('Flg'):
        nulls_to_zero.append(col)
# change nulls to zero
for col in nulls_to_zero:
    data[col].fillna(0, inplace=True)

In [31]:
print(data[nulls_to_zero].isnull().sum())

FullSeasonBuyer                        0
HalfSeasonBuyer                        0
QuarterSeasonBuyer                     0
MiniPlanBuyer                          0
IndividualGameBuyer                    0
FamilyEventFlg                         0
TravelEntertainmentCardHolderFlg       0
MailSuppresionFlg                      0
WorkingWomanFlg                        0
BankCardHolderFlg                      0
GasDepartmentRetailCardHolderFlg       0
TravelEntertainmentCardHolderFlg       0
CreditCardHolderUnknownTypeFlg         0
PremiumCardHolderFlg                   0
UpscaleDepartmentStoreCardHolderFlg    0
MailOrderResponderFlg                  0
TruckOwnerFlg                          0
MotorcycleOwnerFlg                     0
RVOwnerFlg                             0
IntTheatrePerformingArtsFlg            0
IntArtsFlg                             0
IntDomesticTravelFlg                   0
IntHomeStereosFlg                      0
IntMusicDevicesFlg                     0
IntMusicAvidList

In [32]:
# pint out all columns with null values
null_columns = data.columns[data.isnull().any()].tolist()
print("Columns with null values:")
print(null_columns)
print(data[null_columns].isnull().sum())

Columns with null values:
['FirstGameAttended', 'TotalGamesAttended', 'FirstGameBought', 'LastGameBought', 'TotalTicketsPurchased', 'TotalLifetimeValue', 'CurrentSeasonEmailActivities', 'PreviousSeasonsEmailActivities', 'City', 'PostalCd', 'Country', 'Education', 'Occupation', 'MaritalStatus', 'PresenceOfChildren', 'DwellingType', 'PrimaryVehicleType', 'MSADescription', 'DistanceToArena', 'NumberOfAdultsInHousehold', 'LengthOfResidence', 'DiscretionaryIncomeIndex', 'FinancialScore', 'NumberOfVehicles', 'Male18to24', 'Female18to24', 'Male25to34', 'Female25to34', 'Male35to44', 'Female35to44', 'Male45to54', 'Female45to54', 'Male55to65', 'Female55to64', 'Male65to75', 'Female65to74', 'Male75up', 'Female75up', 'ClientFirstSaleDateKey', 'TotalEvents', 'AvgTicketsPerEvent', 'TotalSpend', 'AvgSpendPerEvent', 'LowestTicketPrice', 'HighestTicketPrice', 'AvgDistanceTraveledLocal', 'SpendPerEventConcerts', 'SpendPerEventArts', 'SpendPerEventSports', 'SpendPerEventFamily', 'ClusterCode', 'ClusterDes

In [33]:
# make sure flag columns are of type int
flag_columns = [col for col in data.columns if col.endswith('Flg')]
for col in flag_columns:
    data[col] = data[col].astype(int)
# check the types of all columns
column_types = data.dtypes
for column, dtype in column_types.items():
    print(f"{column}: {dtype}")

KeepFlag: int64
GlobalKey: int64
SeasonKey: int64
FirstGameAttended: object
TotalGamesAttended: float64
FirstGameBought: object
LastGameBought: object
TotalTicketsPurchased: float64
TotalLifetimeValue: float64
CurrentSeasonEmailActivities: float64
PreviousSeasonsEmailActivities: float64
STMFlagCurr: int64
TicketingFanType: object
EmailFanType: object
FullSeasonBuyer: float64
HalfSeasonBuyer: float64
QuarterSeasonBuyer: float64
MiniPlanBuyer: float64
IndividualGameBuyer: float64
City: object
State: object
PostalCd: object
Country: object
Education: object
Occupation: object
Age: float64
MaritalStatus: object
PresenceOfChildren: object
DwellingType: object
PrimaryVehicleType: object
MSADescription: object
MailSuppresionFlg: int64
WorkingWomanFlg: int64
BankCardHolderFlg: int64
GasDepartmentRetailCardHolderFlg: int64
TravelEntertainmentCardHolderFlg: int64
CreditCardHolderUnknownTypeFlg: int64
PremiumCardHolderFlg: int64
UpscaleDepartmentStoreCardHolderFlg: int64
MailOrderResponderFlg: in

In [35]:
# drop shit columns
shittyCols = ['ClusterDescription', 'ClusterCode', 'ClientWalkUpBuyerFlg',
              'PrimaryVehicleType', ]

In [36]:
# Check for NaN values in the 'ClusterCode' column and count them
na_count = data['ClusterCode'].isna().sum()

# Print the count of NaN values
print(f"Number of NaN values in 'ClusterCode': {na_count}")

# print unique values
print(data['ClusterCode'].unique())



Number of NaN values in 'ClusterCode': 85263
[ 7.  0. 13.  3. nan  2. 17. 24.  8.  4. 62.  1. 11. 36. 12.  9. 47. 57.
 22. 40. 16. 23. 27. 65. 55. 14. 48. 43. 50.  5. 38.  6. 28. 26. 19. 30.
 35. 37. 49. 63. 21. 20. 46. 10. 53. 39. 69. 29. 33. 44. 51. 18. 31. 34.
 56. 15. 52. 42. 41. 25. 59. 60. 61. 68. 67. 58. 66. 45. 64. 32. 72. 54.
 71. 70.]


<h1>Save work to TigersCleaned.csv</h1>

In [37]:
#save work to a new csv file
data.to_csv('data/WingsCleaned.csv', index=False)

<h5>Cluster Analysis w/5 clusters</h5>

In [None]:
# print all columns with NaN values
nan_columns = data.columns[data.isnull().any()]
# print the name of the columns with NaN values
for nanCol in nan_columns:
    print(nanCol)

FirstGameAttended
TotalGamesAttended
FirstGameBought
LastGameBought
TotalTicketsPurchased
TotalLifetimeValue
PreviousSeasonsEmailActivities
FullSeasonBuyer
HalfSeasonBuyer
QuarterSeasonBuyer
MiniPlanBuyer
IndividualGameBuyer
City
PostalCd
Country
Education
Occupation
MaritalStatus
PresenceOfChildren
DwellingType
PrimaryVehicleType
MSADescription
MailSuppresionFlg
WorkingWomanFlg
BankCardHolderFlg
GasDepartmentRetailCardHolderFlg
TravelEntertainmentCardHolderFlg
CreditCardHolderUnknownTypeFlg
PremiumCardHolderFlg
UpscaleDepartmentStoreCardHolderFlg
MailOrderResponderFlg
TruckOwnerFlg
MotorcycleOwnerFlg
RVOwnerFlg
IntTheatrePerformingArtsFlg
IntArtsFlg
IntDomesticTravelFlg
IntHomeStereosFlg
IntMusicDevicesFlg
IntMusicAvidListenerFlg
IntMusicCollectorFlg
IntMovieCollectorFlg
IntAutoRacingFlg
IntFootballFlg
IntBaseballFlg
IntBasketballFlg
IntHockeyFlg
IntContestsFlg
IntSportsFlg
IntMusicMoviesFlg
IntNascarFlg
IntUpscaleLivingFlg
DistanceToArena
NumberOfAdultsInHousehold
LengthOfResidence
D

In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture

# Assuming 'data' is your DataFrame

# Create a Gaussian Mixture Model with 5 components (clusters)
gmm = GaussianMixture(n_components=5, random_state=0)

# Fit the model on the entire dataset (excluding any non-numeric columns)
gmm.fit(data_clean.select_dtypes(include=[float, int]))

# Predict the clusters for all data points
data_clean['Cluster'] = gmm.predict(data_clean.select_dtypes(include=[float, int]))

# Get the cluster centers (means of the Gaussian components)
clusterCenters = gmm.means_

# Create a DataFrame to map clusters to their centers
clusterMap = pd.DataFrame(clusterCenters, columns=data_clean.select_dtypes(include=[float, int]).columns)

# Now 'data' has an additional 'Cluster' column indicating the cluster each data point belongs to

NameError: name 'data_clean' is not defined