In [1]:


import pandas as pd
import numpy as np

# Walkthrough of basic pandas DataFrame operations on the customers dataset:
# load, inspect, retype, rename, summarize, filter, sort, add/drop a column,
# and append a row. Each step prints its result so the cell output reads
# top to bottom in the same order as the code.

# Load the dataset (path is relative to the notebook's working directory)
data = pd.read_csv('customers.csv')

# Display the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(data.head())

# Change column data type
data['Age'] = data['Age'].astype(float)
print("\nColumn data types after changing 'Age' to float:")
print(data.dtypes)

# Explain index
print("\nIndex of the DataFrame:")
print(data.index)

# Rename column
# errors='raise' makes a missing 'City_Location' column fail loudly here;
# without it the rename is a silent no-op and the row appended below would
# introduce a second, mostly-empty 'City' column.
data = data.rename(columns={'City_Location': 'City'}, errors='raise')
print("\nColumn names after renaming 'City_Location' to 'City':")
print(data.columns)

# Use describe
print("\nSummary statistics of the DataFrame:")
print(data.describe())

# Query the DataFrame
print("\nQuerying the DataFrame for customers with income over $100,000:")
high_income_customers = data[data['Income'] > 100000]
print(high_income_customers.head())

# Sorting
print("\nSorting the DataFrame by age in descending order:")
sorted_data = data.sort_values(by='Age', ascending=False)
print(sorted_data.head())

# Adding and dropping columns
# A seeded generator keeps the demo column identical across kernel restarts.
rng = np.random.default_rng(42)
data['New_Column'] = rng.integers(0, 2, size=len(data))  # random 0/1 values
print("\nDataFrame after adding a new column 'New_Column':")
print(data.head())

# Reassign instead of inplace=True so the step reads as "new frame from old".
data = data.drop(columns=['New_Column'])
print("\nDataFrame after dropping the 'New_Column':")
print(data.head())

# Appending (concatenate) rows
# Without ignore_index=True, pd.concat keeps each input's original row labels,
# so the appended row would carry label 0 and duplicate an existing label.
# ignore_index=True renumbers the result 0..n-1 instead.
new_row = pd.DataFrame({'Age': [35], 'Gender': ['Male'], 'Income': [80000], 'Socks_Owned': [10], 'Frequent_Losers': [False], 'City': ['Chicago']})
data = pd.concat([data, new_row], ignore_index=True)
print("\nDataFrame after appending a new row:")
print(data.tail())



First few rows of the DataFrame:
   Age  Gender  Income  Socks_Owned  Frequent_Losers  City_Location
0   44  Female   25841            8            False      Charlotte
1   94    Male  135636           16            False     Washington
2   64  Female  116639           19            False       New York
3   40    Male   85988           20            False       New York
4   66    Male   80867            8            False  San Francisco

Column data types after changing 'Age' to float:
Age                float64
Gender              object
Income               int64
Socks_Owned          int64
Frequent_Losers       bool
City_Location       object
dtype: object

Index of the DataFrame:
RangeIndex(start=0, stop=100000, step=1)

Column names after renaming 'City_Location' to 'City':
Index(['Age', 'Gender', 'Income', 'Socks_Owned', 'Frequent_Losers', 'City'], dtype='object')

Summary statistics of the DataFrame:
                 Age         Income    Socks_Owned
count  100000.000000  100000.