In [None]:
import pandas as pd 

# Load the customer records (path is relative to the notebook's working directory).
df = pd.read_csv('customers.csv')

# --- Quick overview of the frame ---
print(df.head())       # first 5 rows
print(df.tail())       # last 5 rows
print(df.shape)        # (rows, columns)
df.info()              # summary info; info() prints its own report and returns None
print(df.describe())   # statistics (numerical cols)
print(df.columns)      # column names

# --- City breakdown ---
filtered_df = df[df['City'] == 'Port Gavin']  # filter rows by condition
print(filtered_df)

# Unique values in a column, sorted. unique() removes the repeats and dropna()
# removes NaN, which cannot be ordered against strings by sorted().
print(sorted(df['City'].dropna().unique()))

# Count occurrences of each unique value; value_counts() already returns the
# counts in descending order, so no extra sort is needed.
count_city = df['City'].value_counts()
print(count_city)

print(df.isna().sum())  # count missing values in each column

In [None]:
import pandas as pd
import numpy as np

# Load the employee records (path is relative to the notebook's working directory).
df = pd.read_csv('data.csv')

# --- Main insights from the data ---
print(df.head())
df.info()              # info() prints its own report and returns None
print(df.describe())
print(df.shape)
print(df.columns)

# --- Are there missing values? (in age, salary, name) ---
print(df.isna().sum())

# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: that form works on an intermediate Series, which newer pandas
# versions warn about and which leaves df unchanged under Copy-on-Write.
df["name"] = df["name"].fillna("Unknown")
df["age"] = df["age"].fillna(df["age"].median())
df["salary"] = df["salary"].fillna(df["salary"].median())

# --- Are there any duplicates? ---
# duplicated() flags rows that repeat an earlier row in every column.
print("Fully duplicated rows:", df.duplicated().sum())
# Per-column frequencies, to eyeball values that repeat suspiciously often.
for col in ["name", "age", "city", "salary", "date"]:
    print(df[col].value_counts())

# --- Do all dates look valid and within range (2020–2023)? ---
# The known-bad value (month 13) is nulled explicitly before parsing.
# NOTE(review): kept on purpose — depending on the pandas version, a lenient
# parser may reinterpret such a string instead of rejecting it; confirm before removing.
df['date'] = df['date'].replace('2020-13-01', np.nan)
# Anything else that cannot be parsed becomes NaT rather than raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Missing/unparseable dates are reported separately, because between() below
# counts them as False together with genuinely out-of-range years.
print("Missing or unparseable dates:", df['date'].isna().sum())
print(df['date'].dt.year.between(2020, 2023).value_counts())


In [None]:

# How many employees (unique id) are there?
print(df['id'].nunique())

# Average, minimum, and maximum salary.
print("Average Salary:", df['salary'].mean())
print("Minimum Salary:", df['salary'].min())
print("Maximum Salary:", df['salary'].max())

# Average age of employees.
print("Average Age:", df['age'].mean())

# How many employees per city.
print(df["city"].value_counts())

# Which names are most common (maybe spotting repeated entries).
# This must count the "name" column; counting "city" here would just repeat
# the per-city table above. Any placeholder value written into "name" during
# cleaning is counted like a real name.
print(df["name"].value_counts().head(2))

# Show a small preview instead of dumping the whole frame.
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Number of records per year (2020, 2021, 2022, 2023).
df["year"] = df["date"].dt.year

# Rows with a missing date (NaT) have no year, and their presence turns the
# whole "year" column into floats (2021.0, ...). Work on the dated rows only,
# with integer years, so the tables and the plot show clean year labels.
# value_counts(), groupby() and the count plot ignore missing years anyway,
# so no counted record is lost by this.
dated = df.dropna(subset=["year"]).astype({"year": int})

# sort_index() lists the years chronologically rather than by frequency.
print(dated["year"].value_counts().sort_index())

# Average salary per year.
print(dated.groupby("year")["salary"].mean().round(2))

# Employee distribution trend over time.
fig, ax = plt.subplots()
sns.countplot(data=dated, x="year", ax=ax)
ax.set(title="Records per year", xlabel="Year", ylabel="Number of records")
plt.show()

# Show a small preview (now including the "year" column) instead of the whole frame.
df.head()

In [None]:
# Which city has the highest average salary?
# The ranking is printed explicitly: a bare expression in the middle of a cell
# is evaluated but never displayed.
avg_salary_by_city = df.groupby('city')['salary'].mean().sort_values(ascending=False)
print(avg_salary_by_city)
print(avg_salary_by_city.head(1))  # the top-paying city

# Which city has the youngest workforce?
print(df.groupby('city')['age'].mean().sort_values(ascending=True).head(1))

# Per-row flag: True when the employee's age is between 10 and 30 inclusive.
# The column name is kept as-is because later cells/outputs may refer to it.
df['youngest_city'] = df["age"].between(10, 30)
# Share of employees aged 10-30 in each city (mean of a boolean = fraction True),
# highest share first.
print(df.groupby('city')['youngest_city'].mean().sort_values(ascending=False))

# Who are the top earners (top 5 salaries)?
top_earners = df.nlargest(5, 'salary')  # computed once, shown two ways
print(top_earners[['name', 'salary']])
print(top_earners[['city', 'salary']])

# Show a small preview instead of dumping the whole frame.
df.head()

In [109]:

# Name clean-up: trim leading/trailing whitespace so that entries such as
# "Ali " and "Ali" compare equal.
trimmed_names = df['name'].str.strip()
df['name'] = trimmed_names

# De-duplication: drop rows that repeat an earlier row in every column,
# keeping the first occurrence. The frame is modified in place.
df.drop_duplicates(keep='first', inplace=True)

# Display the cleaned frame as the cell's output.
df

Unnamed: 0,id,name,age,city,salary,date,yearinRange,youngest_city
0,1,Mona,28.0,Cairo,6500.0,2021-02-28 05:57:50.140280560,True,True
1,2,Khaled,28.0,Cairo,4000.0,2023-03-22 04:37:02.044088176,True,True
2,3,Yara,35.0,Giza,6000.0,2021-04-21 21:55:54.709418840,True,False
3,4,Unknown,30.0,Mansoura,4000.0,2023-08-30 02:44:29.338677344,True,True
4,5,Unknown,22.0,Tanta,6000.0,2023-11-05 09:48:41.843687376,True,True
...,...,...,...,...,...,...,...,...
495,496,Unknown,28.0,Alex,5500.0,2023-08-09 15:11:54.228456912,True,True
496,497,Nada,30.0,Mansoura,6500.0,2020-05-20 10:34:52.184368736,True,True
497,498,Khaled,20.0,Mansoura,8000.0,2022-05-01 10:08:53.867735472,True,True
498,499,Mona,28.0,Giza,4000.0,2023-01-19 17:59:16.713426848,True,True
