In [90]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/YBI-Foundation/Dataset/main/Air%20Quality%20Missing%20Data.csv")

In [4]:
df

Unnamed: 0,Date,Ozone,Solar,Wind,Temp
0,01-05-1976,41.0,190.0,7.4,67
1,02-05-1976,36.0,118.0,8.0,72
2,03-05-1976,12.0,149.0,12.6,74
3,04-05-1976,18.0,313.0,11.5,62
4,05-05-1976,,,14.3,56
...,...,...,...,...,...
148,26-09-1976,30.0,193.0,6.9,70
149,27-09-1976,,145.0,13.2,77
150,28-09-1976,14.0,191.0,14.3,75
151,29-09-1976,18.0,131.0,8.0,76


In [5]:
df.describe()

Unnamed: 0,Ozone,Solar,Wind,Temp
count,116.0,146.0,153.0,153.0
mean,42.12931,185.931507,9.957516,77.882353
std,32.987885,90.058422,3.523001,9.46527
min,1.0,7.0,1.7,56.0
25%,18.0,115.75,7.4,72.0
50%,31.5,205.0,9.7,79.0
75%,63.25,258.75,11.5,85.0
max,168.0,334.0,20.7,97.0


# Q1 How many rows and columns are there in the dataset?

In [6]:
df.shape

(153, 5)

# Q2 What are the column names and their data types?

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    153 non-null    object 
 1   Ozone   116 non-null    float64
 2   Solar   146 non-null    float64
 3   Wind    153 non-null    float64
 4   Temp    153 non-null    int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 6.1+ KB


# Q3 What are the names of the columns in the dataset, and what do they represent?

In [23]:
df.head()

Unnamed: 0,Date,Ozone,Solar,Wind,Temp
0,01-05-1976,41.0,190.0,7.4,67
1,02-05-1976,36.0,118.0,8.0,72
2,03-05-1976,12.0,149.0,12.6,74
3,04-05-1976,18.0,313.0,11.5,62
4,05-05-1976,23.0,308.333333,14.3,56




*   **Date**   =======> date of observation
*  **Ozone**	 =======> o3 count
*   **Solar**  =======> solar count
*   **Wind**  =======> wind count
*  **Temp**   =======> temperature observe






# Q4 Are there any missing values in the dataset?

In [10]:
df.isnull().sum()

Date      0
Ozone    37
Solar     7
Wind      0
Temp      0
dtype: int64

**yes there are missing value**

# Q5 If there are missing values, how are they represented?



*   **NaN (Not a Number)**:   In many datasets, missing values are represented using the NaN value, which stands for "Not a Number." In Pandas, missing numerical values are typically represented as NaN.

*   **Null or Empty Values**:  Some datasets may represent missing values as null or empty strings in string columns.

*  **Placeholder Values**: In some cases, missing values are represented using specific placeholder values that are distinct from valid data. For example, a negative number, a zero, or a specific string could be used as a placeholder for missing data.

*   **Special Codes**: In certain datasets, special codes or symbols might be used to denote missing values. These codes are not valid data entries but serve as placeholders



In [15]:
df.isnull()

Unnamed: 0,Date,Ozone,Solar,Wind,Temp
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,True,True,False,False
...,...,...,...,...,...
148,False,False,False,False,False
149,False,True,False,False,False
150,False,False,False,False,False
151,False,False,False,False,False


In [16]:
df

Unnamed: 0,Date,Ozone,Solar,Wind,Temp
0,01-05-1976,41.0,190.0,7.4,67
1,02-05-1976,36.0,118.0,8.0,72
2,03-05-1976,12.0,149.0,12.6,74
3,04-05-1976,18.0,313.0,11.5,62
4,05-05-1976,,,14.3,56
...,...,...,...,...,...
148,26-09-1976,30.0,193.0,6.9,70
149,27-09-1976,,145.0,13.2,77
150,28-09-1976,14.0,191.0,14.3,75
151,29-09-1976,18.0,131.0,8.0,76


**Missing value are represented by Nan**

# Q6 What methods or strategies would you use to handle missing data in this dataset?

**using fillna method**e with interpolation

In [18]:
df.isnull().sum()

Date      0
Ozone    37
Solar     7
Wind      0
Temp      0
dtype: int64

In [20]:
df["Ozone"].interpolate(method = "linear" , inplace = True)
df["Solar"].interpolate(method = "linear" , inplace = True)

In [21]:
df.isnull().sum()

Date     0
Ozone    0
Solar    0
Wind     0
Temp     0
dtype: int64

# Q7 Are there any duplicated rows in the dataset? How would you deal with them if present?

In [25]:
df.duplicated().sum()

0

# Q8 What are some common visualizations you can create to better understand the data, such as histograms, scatter plots, or box plots?

In [27]:
df.columns

Index(['Date', 'Ozone', 'Solar', 'Wind', 'Temp'], dtype='object')

In [41]:
px.histogram(df , x = "Ozone" , y = "Solar")

In [43]:
px.scatter(df , x = "Wind" , y = "Ozone")

In [54]:
px.violin(df , x = "Wind" , y = "Temp")

In [55]:
px.line(df , x = "Solar" , y = "Wind")

# Q9 Can you visualize the distribution of Ozone over date

In [56]:
px.histogram(df , x = "Ozone" , y = "Date")

In [63]:
px.line(df , x = "Ozone" , y = "Date")

# 10 What is the average (mean) value of Ozone and Wind in the dataset?

In [64]:
df["Ozone"].mean()

43.290849673202615

In [65]:
df["Wind"].mean()

9.957516339869281

# Q11 What is the highest recorded value of O3 and when did it occur?

In [78]:
df["Ozone"].max()

168.0

In [80]:
df[df["Ozone"] == 168.0]

Unnamed: 0,Date,Ozone,Solar,Wind,Temp
116,25-08-1976,168.0,238.0,3.4,81


# Q12 Can you calculate the correlation between different air quality measurements

In [81]:
df.columns

Index(['Date', 'Ozone', 'Solar', 'Wind', 'Temp'], dtype='object')

In [82]:
df["Wind"].corr(df["Ozone"])

-0.5075341870235858

In [85]:
df.corr()





Unnamed: 0,Ozone,Solar,Wind,Temp
Ozone,1.0,0.159473,-0.507534,0.601636
Solar,0.159473,1.0,-0.02928,0.229697
Wind,-0.507534,-0.02928,1.0,-0.457988
Temp,0.601636,0.229697,-0.457988,1.0


# Q13 Is there is data time-stamped? If so, what is the time range covered by the dataset?

In [86]:
df["Date"]                         #  there is no time

0      01-05-1976
1      02-05-1976
2      03-05-1976
3      04-05-1976
4      05-05-1976
          ...    
148    26-09-1976
149    27-09-1976
150    28-09-1976
151    29-09-1976
152    30-09-1976
Name: Date, Length: 153, dtype: object

# Q14 How would you resample or aggregate the data to analyze it on a daily or monthly basis?

In [88]:
df.resample("M").mean()     # Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'RangeIndex'

TypeError: ignored

# Q15 Does the dataset include location information (latitude and longitude)?

In [89]:
df.head()

Unnamed: 0,Date,Ozone,Solar,Wind,Temp
0,01-05-1976,41.0,190.0,7.4,67
1,02-05-1976,36.0,118.0,8.0,72
2,03-05-1976,12.0,149.0,12.6,74
3,04-05-1976,18.0,313.0,11.5,62
4,05-05-1976,23.0,308.333333,14.3,56


**there is no location info**

# Q16 Can you apply a regression model to predict air quality measurements  based on other variables in the dataset?

In [91]:
df.columns

Index(['Date', 'Ozone', 'Solar', 'Wind', 'Temp'], dtype='object')

In [92]:
target = "Wind"
features = ["Date","Ozone" , "Solar","Temp"]

In [93]:
#  Split the data into training and testing sets

x = df[features].values
y = df[target].values

x_train , x_test , y_train , y_test = train_test_split(x, y ,test_size = 0.2 , random_state=42)

In [97]:
lr = LinearRegression()
y_pred = lr.predict(x_test)

In [None]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)