# Problem Statement

**Popular Baby Names by Sex and Ethnic Group.** Data were collected through civil birth registration. Each record represents the ranking of a baby name in order of frequency, so the data can be used to gauge the popularity of a name. Caution should be used when assessing the rank of a baby name if its frequency count is close to 10; the ranking may vary from year to year.

Variable | Definition | Key
| --- | --- | --- |
Year of Birth | The year the child was born | "e.g., 2011"
Gender |The sex of the child | "FEMALE, MALE"
Ethnicity | The reported ethnicity of the mother |"HISPANIC, WHITE NON HISPANIC, etc."
Child's First Name |The first name of the infant|String
Count|Number of children with this name|Integer
Rank|The popularity rank of the name|1 = Most Popular

In [None]:
import pandas as pd
import sqlite3
import threading

In [None]:
import pandas as pd

class BabyNamesDataset:
    """Load a baby-names CSV file into a pandas DataFrame.

    Instance fields:
      file_path -- the path handed to ``pd.read_csv``
      df        -- the loaded DataFrame; stays ``None`` until ``load_data()``
                   succeeds
    """

    def __init__(self, file_path):
        """Remember the CSV location; nothing is read until ``load_data()``."""
        self.df = None
        self.file_path = file_path

    def load_data(self):
        """Read the CSV at ``self.file_path`` into ``self.df``.

        Any exception raised while loading is caught and reported on stdout
        instead of being propagated; in that case ``self.df`` is not updated.
        """
        try:
            loaded = pd.read_csv(self.file_path)
            self.df = loaded
            print(f"file is loaded {len(loaded)} rows.")
        except Exception as err:
            print(f"error while loading data: {err}")

In [None]:
# NameAnalyzer is a child class of BabyNamesDataset
class NameAnalyzer(BabyNamesDataset):
    """Analysis and persistence helpers built on top of BabyNamesDataset."""

    def __init__(self, file_path):
        """Store the CSV path via the parent initializer."""
        super().__init__(file_path)

    def _resolve_column(self, wanted):
        """Return the actual DataFrame column matching *wanted*, ignoring case.

        The CSV headers are capitalised (e.g. "Year of Birth", "Count"), so a
        case-insensitive lookup keeps the analysis working whether or not the
        headers have been normalised.

        Raises:
            KeyError: if no column of the loaded data matches *wanted*.
        """
        target = wanted.strip().lower()
        for column in self.df.columns:
            if str(column).strip().lower() == target:
                return column
        raise KeyError(f"column {wanted!r} not found in loaded data")

    def top_names_by_year(self, year, top_n=5):
        """Return the *top_n* rows with the highest count for the given year.

        Args:
            year: value compared for equality against the year-of-birth column.
            top_n: maximum number of rows to return (default 5).

        Returns:
            A DataFrame sorted by count in descending order, or a message
            string if no data has been loaded yet.

        Raises:
            KeyError: if the year-of-birth or count column is missing.
        """
        if self.df is None:
            return "data is not loaded. please call load_data() first."

        year_col = self._resolve_column("Year of Birth")
        count_col = self._resolve_column("Count")
        filtered = self.df[self.df[year_col] == year]
        return filtered.sort_values(by=count_col, ascending=False).head(top_n)

    def save_to_db(self):
        """Write the loaded data to table 'popularNames' in 'popularNames.db'.

        An existing table of that name is replaced. If no data has been
        loaded, a message is printed and nothing is written.
        """
        if self.df is None:
            print("data is not loaded. please call load_data() first.")
            return

        db_conn = sqlite3.connect("popularNames.db")
        try:
            self.df.to_sql("popularNames", db_conn, if_exists="replace")
        finally:
            # Always release the connection, even if the write fails.
            db_conn.close()
        print("saved in database.")

    def use_thread(self):
        """Run save_to_db() on a worker thread and wait for it to finish.

        Because the thread is joined immediately, this call returns only
        after the save has completed.
        """
        task = threading.Thread(target=self.save_to_db)
        task.start()
        task.join()

In [None]:
# Entry point: build the analyzer, load the CSV, persist it, run a sample query.
if __name__ == "__main__":
    csv_path = r"C:\Users\User\Downloads\Popular_Baby_Names.csv"
    analyzer = NameAnalyzer(csv_path)
    # load_data() comes from the BabyNamesDataset parent class.
    analyzer.load_data()
    # Save the loaded rows to SQLite on a worker thread.
    analyzer.use_thread()
    top_2012 = analyzer.top_names_by_year(2012)
    print(top_2012)

# Data Cleaning and Insights

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Names appear in mixed case in the raw data, so standardise them to lowercase
# before grouping (the group-by below is case-sensitive).
name_col = "Child's First Name"
analyzer.df[name_col] = analyzer.df[name_col].str.lower()

# Keep only the female rows, total the counts per name, take the ten largest.
girls_only = analyzer.df[analyzer.df['Gender'] == 'FEMALE']
girl_totals = girls_only.groupby(name_col)['Count'].sum()
top_girls = girl_totals.nlargest(10)

# Horizontal bar chart; sorted ascending so the largest bar ends up on top.
top_girls.sort_values().plot(kind='barh', color='pink', title='Top 10 Girls')
plt.show()

In [None]:
# Keep only the male rows, total the counts per name, take the ten largest.
boys_only = analyzer.df[analyzer.df['Gender'] == 'MALE']
boy_totals = boys_only.groupby("Child's First Name")['Count'].sum()
top_boys = boy_totals.nlargest(10)

# Horizontal bar chart; sorted ascending so the largest bar ends up on top.
top_boys.sort_values().plot(kind='barh', color='blue', title='Top 10 Boys')
plt.show()