In [1]:

import pandas as pd

# Tiny employee table used to demonstrate missing-value detection.
# Three fields are deliberately left empty: Sneha's Age, Kabir's Salary
# and Anaya's Department.
csv_data = """Name,Age,Department,Salary
Aarav,28,Sales,50000
Sneha,,Engineering,80000
Kabir,25,HR,
Anaya,29,,60000
Rohan,35,Engineering,85000
"""

# Persist the text to disk so it can be loaded through the normal CSV reader.
with open('employees_missing.csv', 'w') as csv_file:
    csv_file.write(csv_data)

# Empty fields come back as NaN in the loaded frame.
df = pd.read_csv('employees_missing.csv')
print(df)

# Boolean frame of the same shape as df: True wherever a value is missing.
missing_mask = df.isnull()
print(missing_mask)

# Summing the mask column-wise gives the number of missing values per column.
print(missing_mask.sum())


    Name   Age   Department   Salary
0  Aarav  28.0        Sales  50000.0
1  Sneha   NaN  Engineering  80000.0
2  Kabir  25.0           HR      NaN
3  Anaya  29.0          NaN  60000.0
4  Rohan  35.0  Engineering  85000.0
    Name    Age  Department  Salary
0  False  False       False   False
1  False   True       False   False
2  False  False       False    True
3  False  False        True   False
4  False  False       False   False
Name          0
Age           1
Department    1
Salary        1
dtype: int64


In [2]:
# Rewrite the sample CSV from the `csv_data` string and reload it, so the
# cleaning steps below always start from the unmodified data.
with open('employees_missing.csv', 'w') as csv_file:
    csv_file.write(csv_data)

df = pd.read_csv('employees_missing.csv')
print(df)

# Where are the gaps? First the per-cell True/False mask, then the count
# of missing values in each column.
missing_mask = df.isnull()
print(missing_mask)
print(missing_mask.sum())

# Strategy 1: discard every row that has at least one missing value.
# The result is a separate frame; df itself is left unchanged.
df_cleaned = df.dropna()
print(df_cleaned)

# Strategy 2: fill each column with a value suited to it.
# 'Age': use the mean of the ages that are present.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
print(df)

# 'Department': use a placeholder string.
department_filled = df['Department'].fillna("Unknown")
df['Department'] = department_filled
print(df)

# 'Salary': use 0.
# NOTE(review): a filled 0 is indistinguishable from a real zero salary and
# will lower any average computed later; confirm this is intended.
salary_filled = df['Salary'].fillna(0)
df['Salary'] = salary_filled
print(df)



    Name   Age   Department   Salary
0  Aarav  28.0        Sales  50000.0
1  Sneha   NaN  Engineering  80000.0
2  Kabir  25.0           HR      NaN
3  Anaya  29.0          NaN  60000.0
4  Rohan  35.0  Engineering  85000.0
    Name    Age  Department  Salary
0  False  False       False   False
1  False   True       False   False
2  False  False       False    True
3  False  False        True   False
4  False  False       False   False
Name          0
Age           1
Department    1
Salary        1
dtype: int64
    Name   Age   Department   Salary
0  Aarav  28.0        Sales  50000.0
4  Rohan  35.0  Engineering  85000.0
    Name    Age   Department   Salary
0  Aarav  28.00        Sales  50000.0
1  Sneha  29.25  Engineering  80000.0
2  Kabir  25.00           HR      NaN
3  Anaya  29.00          NaN  60000.0
4  Rohan  35.00  Engineering  85000.0
    Name    Age   Department   Salary
0  Aarav  28.00        Sales  50000.0
1  Sneha  29.25  Engineering  80000.0
2  Kabir  25.00           HR    

In [1]:
import pandas as pd

# Employee records: one row per employee, keyed by EmpID.
emp_data = """EmpID,Name,Department
101,Amit,Sales
102,Anita,Engineering
103,Kabir,HR
104,Sneha,Marketing
105,Ravi,Engineering
"""

# Department lookup table: one row per department with its location.
dept_data = """Department,Location
Sales,Mumbai
Engineering,Bangalore
HR,Delhi
Marketing,Chennai
"""

# Write each table to its own CSV file.
for csv_path, csv_text in (('employees.csv', emp_data),
                           ('departments.csv', dept_data)):
    with open(csv_path, 'w') as csv_file:
        csv_file.write(csv_text)

# Load both files back as DataFrames.
df_emp = pd.read_csv('employees.csv')
df_dept = pd.read_csv('departments.csv')

print(df_emp)
print(df_dept)

   EmpID   Name   Department
0    101   Amit        Sales
1    102  Anita  Engineering
2    103  Kabir           HR
3    104  Sneha    Marketing
4    105   Ravi  Engineering
    Department   Location
0        Sales     Mumbai
1  Engineering  Bangalore
2           HR      Delhi
3    Marketing    Chennai


In [2]:
# Attach each employee's department Location by joining the two frames on
# their shared 'Department' column (no `how` given, so pandas' default join
# type is used).
df_combined = df_emp.merge(df_dept, on='Department')

print(df_combined)


   EmpID   Name   Department   Location
0    101   Amit        Sales     Mumbai
1    102  Anita  Engineering  Bangalore
2    103  Kabir           HR      Delhi
3    104  Sneha    Marketing    Chennai
4    105   Ravi  Engineering  Bangalore


In [3]:
# Inner join on 'Department': keep only rows whose Department value is
# present in both df_emp and df_dept.
merged_df = df_emp.merge(df_dept, how='inner', on='Department')
print(merged_df)


   EmpID   Name   Department   Location
0    101   Amit        Sales     Mumbai
1    102  Anita  Engineering  Bangalore
2    103  Kabir           HR      Delhi
3    104  Sneha    Marketing    Chennai
4    105   Ravi  Engineering  Bangalore


In [4]:
# Left join on 'Department': keep every row of df_emp and pull in Location
# where the department is found in df_dept (unmatched rows would get NaN).
# In the sample data every employee's department has a match.
merged_df = df_emp.merge(df_dept, how='left', on='Department')
print(merged_df)


   EmpID   Name   Department   Location
0    101   Amit        Sales     Mumbai
1    102  Anita  Engineering  Bangalore
2    103  Kabir           HR      Delhi
3    104  Sneha    Marketing    Chennai
4    105   Ravi  Engineering  Bangalore


In [5]:
from pyspark.sql import SparkSession

# Get the Spark session for this notebook, creating it if none exists yet.
spark = (
    SparkSession.builder
    .appName("PySparkBasics")
    .getOrCreate()
)

# Column names, followed by the (Name, Age) rows to load.
columns = ["Name", "Age"]
data = [("Amit", 25), ("Sneha", 30), ("Kabir", 28)]

# Build a Spark DataFrame from the in-memory rows.
df = spark.createDataFrame(data, schema=columns)

# Print the rows as a text table.
df.show()


+-----+---+
| Name|Age|
+-----+---+
| Amit| 25|
|Sneha| 30|
|Kabir| 28|
+-----+---+



In [6]:
# Salary table to load through Spark's CSV reader.
# NOTE(review): this writes to 'employees.csv', the same path the pandas
# merge example wrote with EmpID/Name/Department columns, so that earlier
# file is replaced here; confirm this is intended.
csv_data = """Name,Department,Salary
Amit,Sales,50000
Sneha,Engineering,80000
Kabir,HR,45000
Anaya,Marketing,60000
Ravi,Engineering,85000
"""

with open('employees.csv', 'w') as csv_file:
    csv_file.write(csv_data)

# header=True: take column names from the first line.
# inferSchema=True: let Spark work out column types from the data.
df = spark.read.csv('employees.csv', inferSchema=True, header=True)

# Show the rows, then the schema Spark inferred.
df.show()
df.printSchema()


+-----+-----------+------+
| Name| Department|Salary|
+-----+-----------+------+
| Amit|      Sales| 50000|
|Sneha|Engineering| 80000|
|Kabir|         HR| 45000|
|Anaya|  Marketing| 60000|
| Ravi|Engineering| 85000|
+-----+-----------+------+

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)

