In [None]:
# Import pandas and numpy, plus matplotlib and seaborn for plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('seaborn-whitegrid')
sns.set_style("whitegrid")

print("Libraries imported successfully!")


In [None]:
# Build a labelled Series: the mapping keys become the index, the values the data
s = pd.Series({'a': 10, 'b': 20, 'c': 30, 'd': 40, 'e': 50})
print("Pandas Series:", s, sep="\n")
print("\nSeries index:", s.index, sep="\n")
print("\nSeries values:", s.values, sep="\n")

# Build a DataFrame column-wise: each dictionary key is a column name and
# each list holds that column's values
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda', 'Bob'],
    'Age': [28, 34, 29, 42, 37],
    'City': ['New York', 'Paris', 'Berlin', 'London', 'Tokyo'],
    'Salary': [65000, 70000, 62000, 85000, 72000],
    'Department': ['IT', 'HR', 'IT', 'Finance', 'Marketing'],
}
df = pd.DataFrame(data=data)
print("\nDataFrame from dictionary:", df, sep="\n")

# Build the same table row-wise: one inner list per record, with the column
# names supplied separately
columns = ['Name', 'Age', 'City', 'Salary', 'Department']
data_list = [
    ['John', 28, 'New York', 65000, 'IT'],
    ['Anna', 34, 'Paris', 70000, 'HR'],
    ['Peter', 29, 'Berlin', 62000, 'IT'],
    ['Linda', 42, 'London', 85000, 'Finance'],
    ['Bob', 37, 'Tokyo', 72000, 'Marketing'],
]
df2 = pd.DataFrame(data=data_list, columns=columns)
print("\nDataFrame from list of lists:", df2, sep="\n")


In [None]:
# Create a sample CSV file (10 employee records, header row included)
csv_data = '''id,name,age,city,salary,hire_date,department,performance_score
1,John Smith,34,New York,75000,2020-03-15,Engineering,4.2
2,Sarah Johnson,28,San Francisco,82000,2019-11-10,Analytics,4.5
3,Michael Brown,45,Chicago,65000,2018-06-22,Marketing,3.9
4,Emily Davis,31,Boston,78000,2021-01-05,Engineering,4.1
5,Robert Wilson,39,Seattle,90000,2017-09-30,Analytics,4.7
6,Jennifer Lee,33,Austin,72000,2019-04-18,Marketing,4.0
7,David Miller,41,Denver,85000,2016-11-08,Engineering,4.3
8,Lisa Wang,36,Portland,79000,2018-02-15,Analytics,4.4
9,James Taylor,44,Atlanta,68000,2017-07-22,Marketing,3.8
10,Amanda Garcia,29,Miami,71000,2020-09-12,Engineering,4.2
'''

# Write to CSV file (relative path: lands in the notebook's working directory)
with open('employee_data.csv', 'w') as f:
    f.write(csv_data)

# Load CSV file; this rebinds `df` to the employee table used by the cells below
df = pd.read_csv('employee_data.csv')

# Basic inspection
print("First 5 rows (head):")
print(df.head())

print("\nLast 5 rows (tail):")
print(df.tail())

print("\nDataFrame shape (rows, columns):")
print(df.shape)

print("\nDataFrame info:")
# info() writes its report to stdout itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line after the report).
df.info()

print("\nDataFrame columns:")
print(df.columns)

print("\nDataFrame data types:")
print(df.dtypes)

print("\nBasic statistics:")
print(df.describe())


In [None]:
# --- Column selection ---
print("Selecting a single column:")
print(df.loc[:, 'name'])

print("\nSelecting multiple columns:")
selected_columns = ['name', 'department', 'salary']
print(df.loc[:, selected_columns])

# --- Row filtering with boolean masks ---
print("\nFiltering rows where age > 35:")
is_over_35 = df['age'].gt(35)
print(df.loc[is_over_35])

print("\nFiltering rows where department is Engineering:")
is_engineering = df['department'].eq('Engineering')
print(df.loc[is_engineering])

print("\nComplex filtering (AND condition):")
is_over_30_and_well_paid = df['age'].gt(30) & df['salary'].gt(75000)
print(df.loc[is_over_30_and_well_paid])

print("\nComplex filtering (OR condition):")
is_analytics_or_top_scorer = df['department'].eq('Analytics') | df['performance_score'].gt(4.5)
print(df.loc[is_analytics_or_top_scorer])

# --- Sorting (sort_values returns a new frame; df itself is not reordered) ---
print("\nSorting by age (ascending):")
print(df.sort_values(by='age', ascending=True))

print("\nSorting by salary (descending):")
print(df.sort_values(by='salary', ascending=False))

print("\nSorting by multiple columns:")
# department A->Z, and within each department highest salary first
print(df.sort_values(by=['department', 'salary'], ascending=[True, False]))


In [None]:
# Derive a bonus column: salary scaled by performance_score, divided by 100
# (i.e. the score is treated as a percentage of salary)
df['bonus'] = df['salary'].mul(df['performance_score']).div(100)
print("DataFrame with new bonus column:")
bonus_preview_columns = ['name', 'salary', 'performance_score', 'bonus']
print(df.loc[:, bonus_preview_columns].head())

# Applying a function to a column
def experience_level(age):
    """Map an age to a coarse experience label.

    Parameters
    ----------
    age : number
        Age in years. Missing values (None, NaN, pd.NA) are accepted.

    Returns
    -------
    str or float
        'Junior' for age < 30, 'Mid-level' for 30 <= age < 40, 'Senior' for
        age >= 40, and NaN when the age is missing.
    """
    # A NaN age fails both `<` comparisons and would otherwise fall through to
    # 'Senior'; None / pd.NA would raise. Propagate "missing" instead.
    if pd.isna(age):
        return np.nan
    if age < 30:
        return 'Junior'
    elif age < 40:
        return 'Mid-level'
    else:
        return 'Senior'

# Label every employee by running experience_level on each age value
experience_labels = df['age'].apply(experience_level)
df['experience'] = experience_labels
print("\nDataFrame with experience level:")
print(df.loc[:, ['name', 'age', 'experience']].head())

# Grouping and aggregation: one groupby object reused for both department views
by_department = df.groupby('department')

print("\nAverage salary by department:")
print(by_department['salary'].mean())

print("\nMultiple aggregations by department:")
# column -> list of aggregations; the result has a two-level column index
department_aggregations = {
    'salary': ['mean', 'min', 'max', 'count'],
    'age': ['mean', 'min', 'max'],
    'performance_score': ['mean'],
}
print(by_department.agg(department_aggregations))

print("\nAverage salary and performance score by experience level:")
experience_aggregations = dict.fromkeys(['salary', 'performance_score'], 'mean')
print(df.groupby('experience').agg(experience_aggregations))


In [None]:
# Create a DataFrame with missing values.
# 'age' and 'salary' are loaded as integers, and an integer column cannot hold
# NaN. Writing NaN into one relies on implicit upcasting, which pandas has
# deprecated for setitem operations; cast to float explicitly instead.
# astype() returns a new frame, so `df` itself is left untouched.
data_missing = df.astype({'age': 'float64', 'salary': 'float64'})
# Introduce some missing values
data_missing.loc[0, 'salary'] = np.nan
data_missing.loc[2, 'age'] = np.nan
data_missing.loc[4, 'performance_score'] = np.nan
data_missing.loc[7, 'department'] = np.nan

print("DataFrame with missing values:")
print(data_missing.head(8))

# Check for missing values
print("\nMissing value count by column:")
print(data_missing.isna().sum())

# Drop rows with missing values (dropna returns a new frame; only its shape is shown)
print("\nDrop rows with any missing values:")
print(data_missing.dropna().shape)

# Fill missing values
print("\nFill missing numeric values with mean:")
numeric_cols = ['age', 'salary', 'performance_score']
data_filled = data_missing.copy()
for col in numeric_cols:
    # mean() skips NaN, so the fill value is the mean of the observed entries
    data_filled[col] = data_filled[col].fillna(data_filled[col].mean())

print(data_filled[['name', 'age', 'salary', 'performance_score']].head(8))

# Fill categorical missing values
print("\nFill missing categorical values with mode:")
# mode() returns a Series (there may be ties); take the first entry by position
most_common_department = data_filled['department'].mode().iloc[0]
data_filled['department'] = data_filled['department'].fillna(most_common_department)
print(data_filled[['name', 'department']].head(8))


In [None]:
# Basic visualizations with pandas (each chart drawn on its own explicit Axes)

# Bar chart of average salary by department
fig, ax = plt.subplots(figsize=(10, 6))
mean_salary_by_department = df.groupby('department')['salary'].mean()
mean_salary_by_department.plot.bar(ax=ax, color='skyblue', rot=0)
ax.set_title('Average Salary by Department')
ax.set_xlabel('Department')
ax.set_ylabel('Average Salary ($)')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Histogram of ages
fig, ax = plt.subplots(figsize=(10, 6))
df['age'].plot.hist(ax=ax, bins=10, color='lightgreen', edgecolor='black')
ax.set_title('Distribution of Employee Ages')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Scatter plot of age vs. salary with department color coding:
# one scatter call per department so each gets its own color and legend entry
fig, ax = plt.subplots(figsize=(10, 6))
for dept, group in df.groupby('department'):
    ax.scatter(group['age'], group['salary'], label=dept, alpha=0.7)
ax.set_title('Age vs. Salary by Department')
ax.set_xlabel('Age')
ax.set_ylabel('Salary ($)')
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)
plt.show()

# Box plot of performance scores by department
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='department', y='performance_score', ax=ax)
ax.set_title('Performance Score Distribution by Department')
ax.set_xlabel('Department')
ax.set_ylabel('Performance Score')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
