SAVING DATA

In [5]:
import pandas as pd

# Sample data with missing values: one blank field each in Age, Department
# and Salary.
csv_data = """Name,Age,Department,Salary
Aarav,28,Sales,50000
Sneha,,Engineering,80000
Kabir,25,HR,
Anaya,29,,60000
Rohan,35,Engineering,85000
"""

# Persist the sample so it can be loaded like a real CSV file.
# The encoding is stated explicitly so the bytes on disk match the UTF-8 that
# pd.read_csv decodes by default, instead of depending on the platform locale.
with open("data.csv", "w", encoding="utf-8") as f:
    f.write(csv_data)

# Blank fields are parsed as NaN; that is why Age and Salary come back as
# float columns rather than integers.
df = pd.read_csv('data.csv')
print(df)

    Name   Age   Department   Salary
0  Aarav  28.0        Sales  50000.0
1  Sneha   NaN  Engineering  80000.0
2  Kabir  25.0           HR      NaN
3  Anaya  29.0          NaN  60000.0
4  Rohan  35.0  Engineering  85000.0


FINDING NULL VALUES

In [6]:
# Boolean frame with the same shape as df: True wherever a cell is missing.
null_mask = df.isnull()
print(null_mask)

    Name    Age  Department  Salary
0  False  False       False   False
1  False   True       False   False
2  False  False       False    True
3  False  False        True   False
4  False  False       False   False


COUNTING NULL VALUES

In [7]:
# Summing the boolean mask column-wise gives the number of missing cells
# per column (each True counts as 1).
null_counts = df.isnull().sum()
print(null_counts)

Name          0
Age           1
Department    1
Salary        1
dtype: int64


FILLING AGE WITH AVERAGE

In [10]:
# Impute missing ages with the column average; mean() skips the NaN entry,
# so the average is taken over the known ages only.
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean)
print(df)

    Name    Age   Department   Salary
0  Aarav  28.00        Sales  50000.0
1  Sneha  29.25  Engineering  80000.0
2  Kabir  25.00           HR      NaN
3  Anaya  29.00          NaN  60000.0
4  Rohan  35.00  Engineering  85000.0


FILLING DEPARTMENT AS GENERAL

In [11]:
# Rows with no department are assigned the placeholder label 'General'.
department_filled = df['Department'].fillna('General')
df['Department'] = department_filled
print(df)

    Name    Age   Department   Salary
0  Aarav  28.00        Sales  50000.0
1  Sneha  29.25  Engineering  80000.0
2  Kabir  25.00           HR      NaN
3  Anaya  29.00      General  60000.0
4  Rohan  35.00  Engineering  85000.0


FILLING SALARY WITH AVERAGE

In [13]:
# Impute the missing salary with the average of the known salaries
# (mean() ignores NaN).
salary_mean = df['Salary'].mean()
df['Salary'] = df['Salary'].fillna(salary_mean)
print(df)

    Name    Age   Department   Salary
0  Aarav  28.00        Sales  50000.0
1  Sneha  29.25  Engineering  80000.0
2  Kabir  25.00           HR  68750.0
3  Anaya  29.00      General  60000.0
4  Rohan  35.00  Engineering  85000.0


MERGE

In [15]:
import pandas as pd

# Create employees CSV: one row per employee, keyed to a department by name.
emp_data = """EmpID,Name,Department
101,Amit,Sales
102,Anita,Engineering
103,Kabir,HR
104,Sneha,Marketing
105,Ravi,Engineering
"""
# The encoding is given explicitly so the file is written as UTF-8 (what
# pd.read_csv decodes by default) rather than in the platform's locale encoding.
with open('employees.csv', 'w', encoding='utf-8') as f:
    f.write(emp_data)

# Create departments CSV: one row per department with its location.
dept_data = """Department,Location
Sales,Mumbai
Engineering,Bangalore
HR,Delhi
Marketing,Chennai
"""
with open('departments.csv', 'w', encoding='utf-8') as f:
    f.write(dept_data)

# Read both files back; 'Department' is the column the two frames share.
df_emp = pd.read_csv('employees.csv')
df_dept = pd.read_csv('departments.csv')

print(df_emp)
print(df_dept)


   EmpID   Name   Department
0    101   Amit        Sales
1    102  Anita  Engineering
2    103  Kabir           HR
3    104  Sneha    Marketing
4    105   Ravi  Engineering
    Department   Location
0        Sales     Mumbai
1  Engineering  Bangalore
2           HR      Delhi
3    Marketing    Chennai


In [18]:
# Inner join on the shared 'Department' column: each employee row picks up
# the Location of its department.
merged_df = df_emp.merge(df_dept, on='Department', how='inner')
print(merged_df)

   EmpID   Name   Department   Location
0    101   Amit        Sales     Mumbai
1    102  Anita  Engineering  Bangalore
2    103  Kabir           HR      Delhi
3    104  Sneha    Marketing    Chennai
4    105   Ravi  Engineering  Bangalore


In [19]:
# IPython shell escape: list the working directory to confirm that the CSV
# files written by the cells above exist.
!ls


data.csv  departments.csv  employees.csv  sample_data


In [20]:
from pyspark.sql import SparkSession

# Get the existing Spark session or create one, with the application name
# "PySparkBasics".
spark = (
    SparkSession.builder
    .appName("PySparkBasics")
    .getOrCreate()
)

# Bare last expression so the notebook displays the session object.
spark

In [21]:
# Build a small Spark DataFrame from in-memory rows: one tuple per row,
# with the column names supplied separately.
data = [
    ('amit', 25),
    ('ravi', 30),
    ('priya', 33),
]
col = ['name', 'age']

df = spark.createDataFrame(data, schema=col)
df.show()

+-----+---+
| name|age|
+-----+---+
| amit| 25|
| ravi| 30|
|priya| 33|
+-----+---+



In [22]:
import pandas as pd

# Sample data with missing values: one blank field each in Age, Department
# and Salary.
csv_data = """Name,Age,Department,Salary
Aarav,28,Sales,50000
Sneha,,Engineering,80000
Kabir,25,HR,
Anaya,29,,60000
Rohan,35,Engineering,85000
"""

# (Re)write the sample file so this cell does not depend on earlier cells
# having created it. The encoding is stated explicitly instead of relying on
# the platform default, which is not UTF-8 everywhere.
with open("data.csv", "w", encoding="utf-8") as f:
    f.write(csv_data)

# header=True takes the column names from the first line; inferSchema=True
# asks Spark to derive column types from the data instead of reading every
# column as a string.
df = spark.read.csv('data.csv', header=True, inferSchema=True)
df.show()
df.printSchema()

+-----+----+-----------+------+
| Name| Age| Department|Salary|
+-----+----+-----------+------+
|Aarav|  28|      Sales| 50000|
|Sneha|NULL|Engineering| 80000|
|Kabir|  25|         HR|  NULL|
|Anaya|  29|       NULL| 60000|
|Rohan|  35|Engineering| 85000|
+-----+----+-----------+------+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)

