# Part 1 
## Pre-processing the data files and building the combined dataset

In [1]:
import pandas as pd
import cassandra
import os 
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_csv_file_paths(folder_path):
    """
    Recursively collect the paths of all CSV files in a folder and its subfolders.

    Args:
        folder_path (str): Path to the root folder containing CSV files.
    Returns:
        list: File paths in sorted order. Each path is built by joining onto
            ``folder_path``, so it is absolute only when ``folder_path`` is.
            The list is empty when no ``.csv`` file is found.
    """
    csv_file_paths = []
    for root, _, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith(".csv"):
                csv_file_paths.append(os.path.join(root, filename))
    # os.walk yields entries in an arbitrary, filesystem-dependent order;
    # sorting makes the result (and anything built from it) reproducible.
    return sorted(csv_file_paths)

In [3]:
# Collect the paths of all CSV files found under the event_data/ folder
csv_data = get_csv_file_paths('event_data/')

In [4]:
# Columns to keep from every raw event file
columns = ['artist','firstName','gender','itemInSession','lastName','length','level','location','sessionId','song','userId']
# Seed the list with an empty frame carrying the target column names so the
# combined result has these columns, in this order, even if no CSV file is found.
frames = [pd.DataFrame(columns=columns)]
for csv_path in csv_data: # loop over each csv file
    df = pd.read_csv(csv_path, usecols=columns)
    # Drop the rows whose artist value is null. subset is passed as a list
    # because a bare string is only accepted by pandas >= 1.5.
    df = df.dropna(subset=['artist'])
    frames.append(df)
# Concatenate once at the end: concatenating inside the loop re-copies all
# previously accumulated rows on every iteration.
event_data = pd.concat(frames, ignore_index=True)
event_data.to_csv('event_datafile_new.csv', index=False)

In [5]:
# Number of rows in the combined frame (rows with a null artist were dropped)
len(event_data)

6831

#### Now we are ready to work with the CSV file titled <font color=red>event_datafile_new.csv</font>
##### The **``event_datafile_new.csv``** contains the following columns: 
- artist 
- firstName of user
- gender of user
- item number in session
- last name of user
- length of the song
- level (paid or free song)
- location of the user
- sessionId
- song title
- userId

The image below is a screenshot of what the denormalized data should look like in the <font color=red>**event_datafile_new.csv**</font> after the code above is run:<br>

<img src="images/1.jpg">