Title of Project
Wine Quality Analysis

Objective
To analyze and understand factors affecting wine quality, preprocess the data, and build predictive models to estimate wine quality based on its features.

Data Source
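The dataset used is `wine_data.csv`. Each row describes one wine by eleven physicochemical attributes (fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol) together with its quality score. Every column contains missing values, which are handled in the preprocessing steps below.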

In [None]:
import pandas as pd
import numpy as np
# Read the wine measurements CSV into the notebook's working DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/wine_data.csv')

In [None]:
# Structural overview of the frame: both ends of the table, its shape, and the
# dtype of every column. Each entry is printed as a heading followed by its value.
overview = (
    ("First 10 rows of the dataset:", df.head(10)),
    ("Last 10 rows of the dataset:", df.tail(10)),
    ("Dimensions of the dataset:", df.shape),
    ("Data types of each column:", df.dtypes),
)
for heading, summary in overview:
    print(heading)
    print(summary)


First 10 rows of the dataset:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            NaN              0.30         0.34             1.6      0.049   
2            NaN               NaN         0.40             6.9        NaN   
3            NaN              0.23         0.32             8.5      0.058   
4            7.2               NaN         0.32             8.5        NaN   
5            8.1               NaN         0.40             6.9        NaN   
6            6.2              0.32         0.16             7.0        NaN   
7            7.0               NaN         0.36            20.7      0.045   
8            NaN              0.30         0.34             1.6      0.049   
9            8.1               NaN         0.43             NaN        NaN   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0               

Rows with a quality score of 8:
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
17              NaN              0.66         0.48             1.2      0.029   
20              6.2              0.66         0.48             1.2        NaN   
22              NaN              0.26         0.42             1.7      0.049   
74              NaN              0.23         0.31             2.1      0.046   
158             5.2               NaN         0.04             1.4      0.036   
...             ...               ...          ...             ...        ...   
5988            NaN               NaN         0.54             1.9        NaN   
6100            8.6              0.42         0.39             NaN        NaN   
6167            5.5              0.49          NaN             1.8      0.044   
6301            7.2              0.33         0.33             NaN      0.061   
6347            NaN              0.38         0.31             NaN      0.056

In [None]:
#print(df.mean())
#print(df.max() - df.min())
#print(df.mean().idxmax())
#print(df.sum())
#print(len(df[df["quality"] > 7]))
#print(df.corr()["quality"][abs(df.corr()["quality"]) > 0.5].index.values)

# determine which attribute(s) have the greatest impact on quality according to a linear regression model
# import statsmodels.api as sm
# X = sm.add_constant(df.iloc[:, :-1])
# y = df.iloc[:, -1]
# model = sm.OLS(y, X).fit()
# print(model.summary())
# Percentage of all rows whose residual sugar exceeds 5. Rows with a missing
# value compare as False, so they count in the denominator only.
print(len(df[df["residual sugar"] > 5])/len(df) * 100)

# Mean volatile acidity at each quality score.
print(df.groupby("quality")["volatile acidity"].mean())

# Bucket alcohol and pH into ordered categories, then average quality per bucket pair.
df["alcohol_level"] = pd.cut(df["alcohol"], bins=[0, 9, 12, 15], labels=["low", "medium", "high"])
df["pH_category"] = pd.cut(df["pH"], bins=[0, 7, 8, 15], labels=["acidic", "neutral", "basic"])
# observed=False is passed explicitly: grouping on categorical keys without it
# emits a FutureWarning, and spelling it out keeps the empty category
# combinations in the result exactly as before.
print(df.groupby(["alcohol_level", "pH_category"], observed=False)["quality"].mean())
#print(df.groupby("quality").mean())

# Column-wise minimum within each quality score.
print(df.groupby("quality").min())

# Binary label: quality in (0, 5] is "bad", (5, 10] is "good".
df["good_bad"] = pd.cut(df["quality"], bins=[0, 5, 10], labels=["bad", "good"])
print(df.head(2))

# The top bin is open-ended. With a hard upper edge of 15, any wine sweeter than
# that (the first row of the data has 20.7) fell outside every bin and was
# silently labeled NaN instead of "high".
df["residual_sugar_level"] = pd.cut(
    df["residual sugar"],
    bins=[0, 2, 4, float("inf")],
    labels=["low", "medium", "high"],
)
print(df.head(2))
def categorize_pH(pH):
    """Label a single pH reading.

    Parameters
    ----------
    pH : float or None
        The pH value to classify. ``None`` and NaN are treated as missing.

    Returns
    -------
    str
        ``"acidic"`` when ``pH < 7``, ``"neutral"`` when ``7 <= pH < 8``,
        ``"basic"`` when ``pH >= 8``, and ``"unknown"`` when the reading is
        missing.
    """
    # NaN is the only float that is not equal to itself. Without this guard a
    # missing reading fails both `<` comparisons below and is reported as
    # "basic". A string label is returned (rather than NaN) so the resulting
    # column stays all-strings.
    if pH is None or pH != pH:
        return "unknown"
    if pH < 7:
        return "acidic"
    if pH < 8:
        return "neutral"
    return "basic"

# Recompute the pH label with categorize_pH, overwriting the existing
# pH_category column, and show the resulting frame.
ph_labels = df["pH"].apply(categorize_pH)
df["pH_category"] = ph_labels
print(df)
#print(df.groupby(["alcohol_level", "quality"]).mean())

# Column-wise maximum within each quality score.
quality_groups = df.groupby("quality")
print(quality_groups.max())






31.98399261197476
quality
3.0    0.537333
4.0    0.460556
5.0    0.390931
6.0    0.310293
7.0    0.289971
8.0    0.290530
9.0    0.270000
Name: volatile acidity, dtype: float64
alcohol_level  pH_category
low            acidic         5.616725
               neutral             NaN
               basic               NaN
medium         acidic         5.716286
               neutral             NaN
               basic               NaN
high           acidic         6.509259
               neutral             NaN
               basic               NaN
Name: quality, dtype: float64
         fixed acidity  volatile acidity  citric acid  residual sugar  \
quality                                                                 
3.0                5.8              0.20         0.00             0.7   
4.0                4.6              0.16         0.00             0.7   
5.0                4.7              0.10         0.00             0.6   
6.0                3.8              0.08         0

  print(df.groupby(["alcohol_level", "pH_category"])["quality"].mean())


In [None]:
# Missing-value handling, starting again from a fresh read of the raw file.
df = pd.read_csv('/content/wine_data.csv')

print(df.isnull().sum())                               # missing count per column
print(df['chlorides'].isnull().sum() / len(df) * 100)  # percent of chlorides missing
print(len(df.dropna()))                                # rows with no missing value at all

# Every column-level fill below is written as an assignment. Calling
# fillna(..., inplace=True) on df[col] mutates an intermediate Series; pandas
# deprecates that pattern and it does not update df under Copy-on-Write.
df['fixed acidity'] = df['fixed acidity'].fillna(df['fixed acidity'].mean())

# Drop last two columns with missing values?
# Collect the columns that still contain gaps (fixed acidity was just filled)
# and drop the final two of them.
missing_cols = df.columns[df.isnull().any()]
last_two_cols = missing_cols[-2:]
df = df.drop(columns=last_two_cols)

# Forward-fill citric acid. .ffill() replaces the deprecated fillna(method='ffill').
df['citric acid'] = df['citric acid'].ffill()
print(df.head(2))

# fixed acidity is already filled, so this sum is the volatile acidity gap count.
print(df['fixed acidity'].isnull().sum() + df['volatile acidity'].isnull().sum())
print(df['density'].isnull().sum() / len(df) * 100)    # percent of density missing

# Median fill for total sulfur dioxide, then print the column median after the fill.
df['total sulfur dioxide'] = df['total sulfur dioxide'].fillna(df['total sulfur dioxide'].median())
print(df['total sulfur dioxide'].median())

# Correlation between residual sugar and pH before and after mean imputation.
print(df[['residual sugar', 'pH']].corr())
df['residual sugar'] = df['residual sugar'].fillna(df['residual sugar'].mean())
df['pH'] = df['pH'].fillna(df['pH'].mean())
print(df[['residual sugar', 'pH']].corr())

# Mode fill for chlorides; mode() returns a Series, so take its first entry.
df['chlorides'] = df['chlorides'].fillna(df['chlorides'].mode()[0])
print(df['chlorides'].nunique())

fixed acidity           1261
volatile acidity        1279
citric acid             1245
residual sugar          1276
chlorides               1277
free sulfur dioxide     1242
total sulfur dioxide    1202
density                 1247
pH                      1220
sulphates               1226
alcohol                 1255
quality                 1242
dtype: int64
19.655225488687087
487
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       7.000000              0.27         0.36            20.7      0.045   
1       7.210074              0.30         0.34             1.6      0.049   

   free sulfur dioxide  total sulfur dioxide  density   pH  sulphates  
0                 45.0                 170.0    1.001  3.0       0.45  
1                 14.0                   NaN    0.994  3.3       0.49  
1279
19.193473911035863
118.0
                residual sugar        pH
residual sugar        1.000000 -0.291418
pH                   -0.291418  1.000000
            

  df['citric acid'].fillna(method='ffill', inplace=True)


In [None]:
# Sorting and filtering exercises on a fresh read of the raw data. Each result
# is bound to `sorted_df` / `filtered_df`, so only the final pair remains after
# the cell has run.
df = pd.read_csv('/content/wine_data.csv')

# Ascending by alcohol; wines with pH below 3.0.
sorted_df = df.sort_values(by='alcohol')
filtered_df = df.loc[df['pH'].lt(3.0)]

# Highest quality first; wines with residual sugar above 3.0.
sorted_df = df.sort_values(by='quality', ascending=False)
filtered_df = df.loc[df['residual sugar'].gt(3.0)]

# Fixed acidity ascending with pH descending; wines above the median alcohol.
sorted_df = df.sort_values(by=['fixed acidity', 'pH'], ascending=[True, False])
median_alcohol = df['alcohol'].median()
filtered_df = df.loc[df['alcohol'].gt(median_alcohol)]

# Fixed acidity ascending with volatile acidity descending; chlorides within
# [0.04, 0.06], both ends included.
sorted_df = df.sort_values(by=['fixed acidity', 'volatile acidity'], ascending=[True, False])
filtered_df = df.loc[df['chlorides'].between(0.04, 0.06)]

# Density descending with residual sugar ascending; wines that are both below
# the mean pH and above the median alcohol.
sorted_df = df.sort_values(by=['density', 'residual sugar'], ascending=[False, True])
mean_pH = df['pH'].mean()
median_alcohol = df['alcohol'].median()
below_mean_ph = df['pH'].lt(mean_pH)
above_median_alcohol = df['alcohol'].gt(median_alcohol)
filtered_df = df.loc[below_mean_ph & above_median_alcohol]

In [None]:
# Split the wines at quality 7 and write each half to its own JSON file.
# Rows whose quality is missing satisfy neither comparison and land in neither file.
high_quality_wine = df[df['quality'] >= 7]
low_quality_wine = df[df['quality'] < 7]
high_quality_wine.to_json('high_quality_wine.json')
low_quality_wine.to_json('low_quality_wine.json')

# From here on `df` holds only the high-quality subset, read back from disk.
df = pd.read_json('high_quality_wine.json')

# Bucket quality into (0, 5] low, (5, 7] medium, (7, 10] high.
bins = [0, 5, 7, 10]
labels = ['low', 'medium', 'high']
df['rating'] = pd.cut(df['quality'], bins=bins, labels=labels)

# Show the last two columns of the first and last two rows. This runs after
# `rating` is created so that the slice is (quality, rating); printing before
# the column existed showed a different pair of columns on a fresh run.
print(df.iloc[:2, -2:])
print(df.iloc[-2:, -2:])

# Most frequent rating, then the number of wines per rating via one-hot columns.
print(df['rating'].value_counts().idxmax())
dummies_df = pd.get_dummies(df['rating'])
print(dummies_df.sum())

    quality  rating
15        7  medium
17        8    high
      quality  rating
6439        7  medium
6442        7  medium
medium
low         0
medium    862
high      164
dtype: int64
