# Data Cleaning

## Handling Missing Values

In [4]:
import pandas as pd
# Build a small frame in which each column is missing exactly one value.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', None],
    'Age': [25, None, 23, 30],
}
df = pd.DataFrame(data)

# Passing a dict to fillna gives every column its own replacement; a bare
# scalar would instead be written into every missing cell, whatever the column.
df_filled = df.fillna(value={'Name': 'Unknown', 'Age': 0})
print(df_filled)

      Name   Age
0    Alice  25.0
1      Bob   0.0
2  Charlie  23.0
3  Unknown  30.0


## Removing Duplicates

In [14]:
# Rows 0 and 2 hold the same Name and Age, so row 2 is an exact repeat.
data = {
    'Name': ['Alice', 'Bob', 'Alice'],
    'Age': [25, 26, 25],
}
df = pd.DataFrame(data)

# Flag every row that repeats an earlier one, then keep only the unflagged rows.
df_unique = df[~df.duplicated()]
print(df_unique)

    Name  Age
0  Alice   25
1    Bob   26


## Converting Data Types

In [23]:
# Age arrives as strings, so it has to be cast before it can be used as a number.
data = {
    'Name': ['Alice', 'Bob'],
    'Age': ['25', '26'],
}

# Cast only the Age column; Name is left as it was.
df = pd.DataFrame(data).astype({'Age': int})

# NOTE: the recorded output shows int32 for Age. `int` resolves to NumPy's
# default integer type, whose width may differ on another platform or version.
print(df.dtypes)

Name    object
Age      int32
dtype: object


# Data Transformation

## Filtering and Sorting Data

In [27]:
# Sample data shared by the filter and the sort below
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 26, 23],
}
df = pd.DataFrame(data)

# Keep only the rows whose Age is strictly greater than 24
filtered_df = df.loc[df['Age'].gt(24)]

# Order every row by Age, smallest first; the original index labels travel with the rows
sorted_df = df.sort_values('Age', ascending=True)

# Show the filtered frame first, then the sorted one
print(filtered_df, sorted_df, sep='\n')


    Name  Age
0  Alice   25
1    Bob   26
      Name  Age
2  Charlie   23
0    Alice   25
1      Bob   26


## Merging and Joining Datasets

In [32]:
# Two frames with identical columns, to be stacked row-wise
data1 = {'Name': ['Alice', 'Bob'], 'Age': [25, 26]}
data2 = {'Name': ['Charlie', 'David'], 'Age': [23, 30]}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Stack df2 beneath df1. Each input is indexed 0, 1, so a plain concat would
# carry over the labels 0, 1, 0, 1; ignore_index=True renumbers the result 0-3.
merged_df = pd.concat([df1, df2], ignore_index=True)
print(merged_df)


      Name  Age
0    Alice   25
1      Bob   26
2  Charlie   23
3    David   30


## Grouping and Aggregating Data

In [35]:
# Alice appears twice, so her two scores collapse into a single averaged row
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice'],
    'Score': [85, 90, 78, 88],
}
df = pd.DataFrame(data)

# Mean Score per Name; the group keys become the index of the result
grouped_df = df.groupby('Name')[['Score']].mean()
print(grouped_df)


         Score
Name          
Alice     86.5
Bob       90.0
Charlie   78.0


# Data Analysis

## Descriptive Statistics

In [6]:
# A single numeric column with four observations
data = dict(Age=[25, 26, 23, 30])
df = pd.DataFrame.from_dict(data)

# Summary table: count, mean, std, min, the three quartiles and max
stats = df.describe()
print(stats)


            Age
count   4.00000
mean   26.00000
std     2.94392
min    23.00000
25%    24.50000
50%    25.50000
75%    27.00000
max    30.00000


## Correlation and Covariance

In [9]:
# B is A in reverse order, so the two columns move in exactly opposite directions
data = dict(A=[1, 2, 3, 4], B=[4, 3, 2, 1])
df = pd.DataFrame(data)

# Pairwise correlation matrix, followed by the pairwise covariance matrix
correlation = df.corr(method='pearson')
covariance = df.cov()

# Correlation is printed first, covariance second
print(correlation, covariance, sep='\n')

     A    B
A  1.0 -1.0
B -1.0  1.0
          A         B
A  1.666667 -1.666667
B -1.666667  1.666667


## Time Series Analysis

In [12]:
# Four consecutive calendar days, starting 2020-01-01, serve as the index
dates = pd.date_range('2020-01-01', periods=4, freq='D')
data = dict(Value=[10, 20, 15, 30])
df = pd.DataFrame(data, index=dates)

# Two-row moving average; the first row has no predecessor, so its result is NaN
rolling_mean = df.rolling(2).mean()
print(rolling_mean)


            Value
2020-01-01    NaN
2020-01-02   15.0
2020-01-03   17.5
2020-01-04   22.5
