# Importing Pandas library

In [1]:
import pandas as pd

# Load the CSV file into a Pandas DataFrame

In [2]:
# Read the raw measurements from a CSV located relative to the working directory.
# NOTE(review): the alignment of the printed head() output suggests that some
# column headers carry a trailing space (e.g. 'ChipLevel4 ') — confirm before
# selecting those columns by name.
file_path = '01.Data Cleaning and Preprocessing.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Display the first few rows of the DataFrame

In [3]:
# Caption followed by the first rows of the freshly loaded frame.
# A single print call with a newline separator emits the same text as two calls.
print("Initial DataFrame:", df.head(), sep="\n")

Initial DataFrame:
  Observation  Y-Kappa  ChipRate  BF-CMratio  BlowFlow  ChipLevel4   \
0    31-00:00    23.10    16.520     121.717  1177.607      169.805   
1    31-01:00    27.60    16.810      79.022  1328.360      341.327   
2    31-02:00    23.19    16.709      79.562  1329.407      239.161   
3    31-03:00    23.60    16.478      81.011  1334.877      213.527   
4    31-04:00    22.90    15.618      93.244  1334.168      243.131   

   T-upperExt-2   T-lowerExt-2    UCZAA  WhiteFlow-4   ...  SteamFlow-4   \
0        358.282         329.545  1.443       599.253  ...        67.122   
1        351.050         329.067  1.549       537.201  ...        60.012   
2        350.022         329.260  1.600       549.611  ...        61.304   
3        350.938         331.142  1.604       623.362  ...        68.496   
4        351.640         332.709    NaN       638.672  ...        70.022   

   Lower-HeatT-3  Upper-HeatT-3   ChipMass-4   WeakLiquorF   BlackFlow-2   \
0        329.432    

# Filtering data: Example - Filter rows where 'Y-Kappa' is greater than 25

In [4]:
# Keep only the rows whose 'Y-Kappa' reading is strictly greater than 25,
# then preview the first rows of the result.
high_kappa_mask = df['Y-Kappa'].gt(25)
filtered_df = df.loc[high_kappa_mask]
print("\nFiltered DataFrame (Y-Kappa > 25):", filtered_df.head(), sep="\n")



Filtered DataFrame (Y-Kappa > 25):
   Observation  Y-Kappa  ChipRate  BF-CMratio  BlowFlow  ChipLevel4   \
1     31-01:00    27.60    16.810      79.022  1328.360      341.327   
12    31-11:00    26.62    15.467      84.447  1334.255      386.971   
13    31-12:00    27.20    16.083      82.839  1332.331      366.855   
15    31-14:00    25.40    16.425      72.924  1197.775      118.821   
40     1-15:00    27.10    13.558      83.117  1175.417      289.256   

    T-upperExt-2   T-lowerExt-2    UCZAA  WhiteFlow-4   ...  SteamFlow-4   \
1         351.050         329.067  1.549       537.201  ...        60.012   
12        349.392         321.021  1.428       531.250  ...        59.407   
13        350.094         327.439  1.486       527.893  ...        60.271   
15        350.765         329.799  1.635       585.011  ...        65.474   
40        339.168         318.386  1.360       480.184  ...        48.568   

    Lower-HeatT-3  Upper-HeatT-3   ChipMass-4   WeakLiquorF   BlackF

# Handling missing values: Example - Fill missing values in numeric columns with each column's mean

In [5]:
# Impute gaps in the numeric columns with each column's own mean.
# Non-numeric columns are left as they are, and `df` itself is not modified:
# the filled values are written into a separate copy.
numeric_cols = df.select_dtypes(include=['number']).columns
column_means = df[numeric_cols].mean()

df_filled = df.copy()
df_filled[numeric_cols] = df[numeric_cols].fillna(column_means)

print(
    "\nDataFrame after filling missing values with column mean (numeric columns only):",
    df_filled.head(),
    sep="\n",
)


DataFrame after filling missing values with column mean (numeric columns only):
  Observation  Y-Kappa  ChipRate  BF-CMratio  BlowFlow  ChipLevel4   \
0    31-00:00    23.10    16.520     121.717  1177.607      169.805   
1    31-01:00    27.60    16.810      79.022  1328.360      341.327   
2    31-02:00    23.19    16.709      79.562  1329.407      239.161   
3    31-03:00    23.60    16.478      81.011  1334.877      213.527   
4    31-04:00    22.90    15.618      93.244  1334.168      243.131   

   T-upperExt-2   T-lowerExt-2      UCZAA  WhiteFlow-4   ...  SteamFlow-4   \
0        358.282         329.545  1.44300       599.253  ...        67.122   
1        351.050         329.067  1.54900       537.201  ...        60.012   
2        350.022         329.260  1.60000       549.611  ...        61.304   
3        350.938         331.142  1.60400       623.362  ...        68.496   
4        351.640         332.709  1.49201       638.672  ...        70.022   

   Lower-HeatT-3  Upper

# Calculating summary statistics for numeric columns

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# imputed frame, kept in `summary_stats` and printed under a caption.
summary_stats = df_filled.describe()
print("\nSummary Statistics (numeric columns only):", summary_stats, sep="\n")



Summary Statistics (numeric columns only):
          Y-Kappa    ChipRate  BF-CMratio     BlowFlow  ChipLevel4   \
count  324.000000  324.000000  324.000000   324.000000   324.000000   
mean    20.635370   14.347937   87.464456  1237.837614   258.164483   
std      3.070036    1.487447    7.781774    98.070606    87.851143   
min     12.170000    9.983000   68.645000     0.000000     0.000000   
25%     18.382500   13.364750   82.156750  1194.525750   213.527000   
50%     20.845000   14.347937   87.253500  1254.658500   271.605500   
75%     23.032500   15.498250   92.123250  1288.628750   321.285000   
max     27.600000   16.958000  121.717000  1351.240000   419.014000   

       T-upperExt-2   T-lowerExt-2         UCZAA  WhiteFlow-4   AAWhiteSt-4   \
count     324.000000       324.00000  324.000000     324.00000    324.000000   
mean      356.904295       324.02018    1.492010     591.73226      6.140410   
std         9.180734         7.59777    0.101741      66.91253      0.059553