<a href="https://colab.research.google.com/github/MostaryKhatun1/Project-with-Python-Django/blob/main/BodyWeight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Connect Google Drive to Colab



In [None]:
# Step 1: Mount Google Drive
# Attach Google Drive to the Colab filesystem so later cells can read and
# write files under the mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [None]:
# Step 2: Load datasets from Google Drive
import pandas as pd

# Read the main table and the additional table from the mounted Drive.
# Adjust the two paths if the CSV files live elsewhere.
main_df = pd.read_csv('/content/drive/MyDrive/weight.csv')
additional_df = pd.read_csv('/content/drive/MyDrive/weight_test.csv')

# Preview the first rows of each frame to confirm both files loaded.
for heading, frame in (("Main Dataset:", main_df),
                       ("\nAdditional Dataset:", additional_df)):
    print(heading)
    print(frame.head())


Main Dataset:
   Age(Week)  Consumption in g/bird/day  BodyWeightAvg (g)  \
0          1                         12               67.5   
1          2                         18              115.0   
2          3                         26              195.0   
3          4                         33              287.5   
4          5                         38              390.0   

   Cumulative g to date  
0                    84  
1                   210  
2                   392  
3                   623  
4                   889  

Additional Dataset:
   Age(Week)  Consumption in g/bird/day  BodyWeightAvg (g)
0         13                         70             1145.0
1         14                         73             1222.5
2         15                         76             1305.0
3         16                         79             1387.5
4         17                         81             1470.0


Append or Merge Additional Data to the Main Dataset

In [None]:
# Step 3a: Stack the additional rows underneath the main rows.
# ignore_index=True renumbers the result 0..n-1; columns that exist in only
# one of the frames are filled with NaN for the other frame's rows.
frames_to_stack = [main_df, additional_df]
combined_df = pd.concat(frames_to_stack, axis=0, ignore_index=True)

# Show the tail so the appended rows are visible.
print("Combined Dataset after appending:")
print(combined_df.tail())


Combined Dataset after appending:
    Age(Week)  Consumption in g/bird/day  BodyWeightAvg (g)  \
13         14                         73             1222.5   
14         15                         76             1305.0   
15         16                         79             1387.5   
16         17                         81             1470.0   
17         18                         83             1540.0   

    Cumulative g to date  
13                   NaN  
14                   NaN  
15                   NaN  
16                   NaN  
17                   NaN  


Handling Missing Values

In [None]:
# Step 4: Handling missing values
from sklearn.impute import SimpleImputer

# Check for missing values
print("Missing values before imputation:")
print(combined_df.isnull().sum())

# Impute with the median every numeric column that actually has gaps.
# The previous version hard-coded two columns that contained no missing
# values, so the column that did contain NaNs was never filled and this step
# was a no-op. Columns that are entirely NaN are skipped because there is no
# median to compute for them.
# NOTE(review): a median is a crude fill for a running-total column such as
# 'Cumulative g to date' — consider recomputing it from the daily
# consumption instead; confirm the intended definition with the data owner.
imputer = SimpleImputer(strategy='median')
numeric_cols = combined_df.select_dtypes(include='number').columns
cols_to_impute = [
    col for col in numeric_cols
    if combined_df[col].isnull().any() and combined_df[col].notnull().any()
]
if cols_to_impute:
    combined_df[cols_to_impute] = imputer.fit_transform(combined_df[cols_to_impute])
    print(f"Imputed columns: {cols_to_impute}")

# Check for missing values after imputation
print("Missing values after imputation:")
print(combined_df.isnull().sum())


Missing values before imputation:
Age(Week)                    0
Consumption in g/bird/day    0
BodyWeightAvg (g)            0
Cumulative g to date         6
dtype: int64
Missing values after imputation:
Age(Week)                    0
Consumption in g/bird/day    0
BodyWeightAvg (g)            0
Cumulative g to date         6
dtype: int64


 Detect and Replace Outliers

In [None]:
# Step 5: Outlier detection and replacement using Z-score
import numpy as np
from scipy import stats

# Columns screened for outliers. The mask built below has exactly one column
# per entry of this list, in this order.
outlier_cols = ['Consumption in g/bird/day', 'BodyWeightAvg (g)']
threshold = 3  # Typical threshold for outliers

# Work on a plain float array so every later lookup is positional and
# unambiguous. nan_policy='omit' keeps a single NaN from turning a whole
# column's z-scores into NaN.
z_scores = np.abs(
    stats.zscore(combined_df[outlier_cols].to_numpy(dtype=float), axis=0, nan_policy='omit')
)

# Identify outliers (where Z-score > threshold); NaN compares as False.
outliers_mask = z_scores > threshold

# Count and display the number of outliers detected for each column
outliers_count = outliers_mask.sum(axis=0)
print("Number of outliers detected:")
for col, count in zip(outlier_cols, outliers_count):
    print(f"{col}: {count}")

# Replace outliers with the column median. The mask column is selected by the
# position of `col` in `outlier_cols`; the previous code used the column's
# position in `combined_df`, which pointed at the wrong mask column (or past
# the end of the mask) as soon as an outlier was actually found.
for position, col in enumerate(outlier_cols):
    column_mask = outliers_mask[:, position]
    if column_mask.any():
        median = combined_df[col].median()
        # Series.mask returns a new column, so an integer column is upcast
        # cleanly when the median is fractional.
        combined_df[col] = combined_df[col].mask(column_mask, median)
        print(f"Replaced outliers in {col} with median: {median}")

# Verify that outliers have been replaced (if there were any)
print("Descriptive statistics after outlier replacement:")
print(combined_df.describe())


Number of outliers detected:
Consumption in g/bird/day: 0
BodyWeightAvg (g): 0
Descriptive statistics after outlier replacement:
       Age(Week)  Consumption in g/bird/day  BodyWeightAvg (g)  \
count  18.000000                  18.000000          18.000000   
mean    9.500000                  54.277778         807.222222   
std     5.338539                  22.183209         482.100100   
min     1.000000                  12.000000          67.500000   
25%     5.250000                  39.250000         414.375000   
50%     9.500000                  57.000000         820.000000   
75%    13.750000                  72.250000        1203.125000   
max    18.000000                  83.000000        1540.000000   

       Cumulative g to date  
count             12.000000  
mean            1537.083333  
std             1178.593573  
min               84.000000  
25%              565.250000  
50%             1354.500000  
75%             2364.250000  
max             3605.000000  


  if outliers_count[['Consumption in g/bird/day', 'BodyWeightAvg (g)'].index(col)] > 0:


Generate a Target Variable for Machine Learning

In [None]:
# Step 6: Attach a constant 'Target' column holding the week-18 weight goal.
target_weight = 1540  # Target weight based on your goal for week 18

# Every row receives the same value; an existing 'Target' column is replaced.
combined_df = combined_df.assign(Target=target_weight)

# Show the last rows to confirm the column is present.
print(combined_df.tail())


    Age(Week)  Consumption in g/bird/day  BodyWeightAvg (g)  \
13         14                       73.0             1222.5   
14         15                       76.0             1305.0   
15         16                       79.0             1387.5   
16         17                       81.0             1470.0   
17         18                       83.0             1540.0   

    Cumulative g to date  Target  
13                   NaN    1540  
14                   NaN    1540  
15                   NaN    1540  
16                   NaN    1540  
17                   NaN    1540  


 Save the Preprocessed Data

In [None]:
# Step 7: Save the preprocessed dataset
# Write the frame to Drive without the index column.
preprocessed_csv_path = '/content/drive/MyDrive/preprocessed_combined_data.csv'
combined_df.to_csv(preprocessed_csv_path, index=False)

# Optionally, hand the same file to the browser as a local download.
from google.colab import files
files.download(preprocessed_csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Build a Machine Learning Model (Optional)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# The quantity being predicted. 'Target' is a copy of this column, so the
# column itself must not stay among the features (see Step 8).
target_source_col = 'BodyWeightAvg (g)'
combined_df['Target'] = combined_df[target_source_col]  # Replace with your target logic

# Step 8: Split the data into features (X) and target (y).
# Both 'Target' and the column it was copied from are dropped. Leaving the
# source column in X lets the model read the answer directly, which produces
# a meaningless R^2 of exactly 1.0.
X = combined_df.drop(columns=['Target', target_source_col])
y = combined_df['Target']

# Split first, so the test rows never influence any fitted statistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8a: Impute missing values. The imputer learns the column means from
# the training rows only and then applies them to the test rows.
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent'
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Kept for any later cell that still expects the fully imputed feature matrix.
X_imputed = imputer.transform(X)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the held-out test set
score = model.score(X_test, y_test)
print(f"Model R^2 score: {score}")

# Optional: Make predictions
predictions = model.predict(X_test)
print("Predictions on test set:")
print(predictions)


Model R^2 score: 1.0
Predictions on test set:
[ 67.5 115.  770.  487.5]
