### Imports necessary libraries for data manipulation, visualization, model training, and evaluation.

In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split , cross_val_score , GridSearchCV , RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder , OneHotEncoder , StandardScaler , OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRFClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier , RandomForestClassifier , GradientBoostingClassifier
from sklearn.metrics import confusion_matrix , accuracy_score , classification_report
from sklearn.pipeline import make_pipeline , Pipeline
from sklearn.compose import ColumnTransformer , make_column_transformer

### Reads a CSV file ('Popular_Baby_Names.csv') into a DataFrame and displays the first few rows.

In [3]:
names = ['year_of_birth' , 'gender' , 'ethnicity' , 'child_first_name' , 'count' , 'rank']
df = pd.read_csv('Popular_Baby_Names.csv' , names=names , header=0)
df.head()

Unnamed: 0,year_of_birth,gender,ethnicity,child_first_name,count,rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


In [4]:
df['child_first_name'].shape

(57582,)

In [5]:
df['rank'].unique()

array([ 75,  67,  42,  51,  53,  62,   8,  74,  71,  78,  73,  72,  77,
        60,   1,  70,  61,  65,  58,  41,  76,  68,  48,  55,  40,  66,
        64,  34,  44,  57,  63,  33,  31,  20,  69,  35,  13,  52,  59,
        39,   9,  27,  10,  56,  12,   2,  25,  18,  14,  38,  28,   6,
         3,  19,  45,  47,  11,  79,  17,  43,  80,  37,  81,  46,   5,
        22,  50,  21,  30,  24,  54,  15,  36,  23,   7,  16,  49,  29,
         4,  32,  26,  92,  90,  82,  91,  88,  89,  94,  83,  93,  84,
        87,  85,  86,  96,  97,  95,  99,  98, 100, 101, 102], dtype=int64)

In [6]:
df['rank'].nunique()

102

- df['rank'] = 1 to 25 = high_popular
- df['rank'] = 25 to 50 = popular
- df['rank'] = 50 to 75 = medium_popular
- df['rank'] = 75 to _ = low_popular

### Updates the 'target' column in the DataFrame based on specified rank ranges.

In [7]:
df['target'] = 'less_popular'

# Update 'target' based on the specified ranges
df.loc[(df['rank'] >= 1) & (df['rank'] <= 25), 'target'] = 'high_popular'
df.loc[(df['rank'] > 25) & (df['rank'] <= 50), 'target'] = 'popular'
df.loc[(df['rank'] > 50) & (df['rank'] <= 75), 'target'] = 'medium_popular'

In [8]:
df['target'].unique()

array(['medium_popular', 'popular', 'high_popular', 'less_popular'],
      dtype=object)

In [9]:
df['target'].value_counts()

target
less_popular      17373
medium_popular    16806
popular           15941
high_popular       7462
Name: count, dtype: int64

In [10]:
mapping = {
    "high_popular" : 1,
    "popular" : 2,
    "medium_popular" : 3,
    "less_popular" : 4
}
df['target'] = df['target'].map(mapping)
df['target'].value_counts()

target
4    17373
3    16806
2    15941
1     7462
Name: count, dtype: int64

In [11]:
nominal_cols = ['gender','ethnicity']
label_col = ['child_first_name']
standard_scaling_data = ['year_of_birth', 'gender', 'ethnicity', 'child_first_name', 'count']

In [12]:
X = df.drop(columns=['rank','target'])
y = df['target']

X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

#label_encode_cols = ['child_first_name']
one_hot_encode_cols = ['gender', 'ethnicity','child_first_name']
scale_cols = ['count']

preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encode', OneHotEncoder(drop='first',handle_unknown='ignore' , sparse_output=False), one_hot_encode_cols),
        ('scale', StandardScaler(), ['count']),
    ],
    remainder='passthrough' 
)

# Create the final pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [14]:
X_train_transformed = pipeline.fit_transform(X_train)
column_names = (pipeline.named_steps['preprocessor']
                .named_transformers_['one_hot_encode']
                .get_feature_names_out(input_features=one_hot_encode_cols))
column_names = list(column_names) + ['count'] + ['year_of_birth']

X_train_transformed_df = pd.DataFrame(X_train_transformed,columns=column_names)
X_train_transformed_df.head()

Unnamed: 0,gender_MALE,ethnicity_ASIAN AND PACIFIC ISLANDER,ethnicity_BLACK NON HISP,ethnicity_BLACK NON HISPANIC,ethnicity_HISPANIC,ethnicity_WHITE NON HISP,ethnicity_WHITE NON HISPANIC,child_first_name_AARAV,child_first_name_AARON,child_first_name_AAYAN,...,child_first_name_Zion,child_first_name_Zissy,child_first_name_Zoe,child_first_name_Zoey,child_first_name_Zora,child_first_name_Zoya,child_first_name_Zuri,child_first_name_Zyaire,count,year_of_birth
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.005663,2011.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.773576,2012.0
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.414455,2011.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.483347,2019.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.260954,2012.0


In [37]:
X_test_transformed = pipeline.transform(X_test)
column_names = (pipeline.named_steps['preprocessor']
                .named_transformers_['one_hot_encode']
                .get_feature_names_out(input_features=one_hot_encode_cols))
column_names = list(column_names) + ['count'] + ['year_of_birth']

X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=column_names)
X_test_transformed_df.tail()

Unnamed: 0,gender_MALE,ethnicity_ASIAN AND PACIFIC ISLANDER,ethnicity_BLACK NON HISP,ethnicity_BLACK NON HISPANIC,ethnicity_HISPANIC,ethnicity_WHITE NON HISP,ethnicity_WHITE NON HISPANIC,child_first_name_AARAV,child_first_name_AARON,child_first_name_AAYAN,...,child_first_name_Zion,child_first_name_Zissy,child_first_name_Zoe,child_first_name_Zoey,child_first_name_Zora,child_first_name_Zoya,child_first_name_Zuri,child_first_name_Zyaire,count,year_of_birth
11512,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.355089,2014.0
11513,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.953137,2012.0
11514,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.380741,2019.0
11515,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.611604,2014.0
11516,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.355089,2014.0


In [36]:
X_test_transformed_df['child_first_name_Abdul'].unique()

array([0., 1.])

In [16]:
X_test_transformed_df.shape

(11517, 3165)

In [17]:
X_train_transformed_df.shape

(46065, 3165)

In [18]:
model_list = {
    "LogisticRegression" : LogisticRegression(),
    "RandomForestClassifier" : RandomForestClassifier(),
    "DecisionTreeClassifier" : DecisionTreeClassifier(),
}

In [19]:
lr = LogisticRegression()
lr.fit(X_train_transformed,y_train)

In [20]:
y_pred = lr.predict(X_test_transformed)
accuracy_score(y_test,y_pred)

0.8403230007814535

In [30]:
df.head()

Unnamed: 0,year_of_birth,gender,ethnicity,child_first_name,count,rank,target
0,2011,FEMALE,HISPANIC,GERALDINE,13,75,3
1,2011,FEMALE,HISPANIC,GIA,21,67,3
2,2011,FEMALE,HISPANIC,GIANNA,49,42,2
3,2011,FEMALE,HISPANIC,GISELLE,38,51,3
4,2011,FEMALE,HISPANIC,GRACE,36,53,3


In [32]:
data = [[2011,"FEMALE","HISPANIC","GERALDINE",13]]
data_df = pd.DataFrame(data, columns=['year_of_birth', 'gender', 'ethnicity', 'child_first_name', 'count'])

# Transform the data using the pipeline
X_transformed = pipeline.transform(data_df)

# Make predictions using the trained classifier
predictions = lr.predict(X_transformed)

print(predictions)

[3]
