<a href="https://colab.research.google.com/github/HarshVarshney0001/Stock-price-prediction/blob/main/python_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gdown



# Project Report: Stock Price Prediction Using Linear Regression with Polynomial Features

In [None]:
# step 1: Data Loading and Exploration

!gdown https://drive.google.com/uc?id={'19uOraLpweUSWUCNUkjIS7-_yat__mn3x'}
import pandas as pd
df = pd.read_csv('AAPL Historical Data.csv')
df.head(6)

Downloading...
From: https://drive.google.com/uc?id=19uOraLpweUSWUCNUkjIS7-_yat__mn3x
To: /content/AAPL Historical Data.csv
  0% 0.00/83.3k [00:00<?, ?B/s]100% 83.3k/83.3k [00:00<00:00, 61.3MB/s]


Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,11/06/2024,222.72,222.61,226.07,221.19,54.56M,-0.33%
1,11/05/2024,223.45,221.79,223.95,221.14,28.11M,0.65%
2,11/04/2024,222.01,220.99,222.79,219.71,44.94M,-0.40%
3,11/01/2024,222.91,220.96,225.35,220.27,65.28M,-1.33%
4,10/31/2024,225.91,229.34,229.83,225.37,64.37M,-1.82%
5,10/30/2024,230.1,232.61,233.47,229.55,47.07M,-1.53%


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Step 2: Data Exploration and Cleaning
# Parse dates with an explicit month/day/year format (the export contains values
# such as 10/31/2024) so day and month can never be swapped by format inference.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
# Rebind rather than rename in place, so the statement yields a new frame
# instead of mutating the one loaded earlier.
df = df.rename(columns={'Vol.': 'Volume'})

# Ensure 'Volume' is treated as a string, and clean any commas before handling 'M' and 'K' suffixes.
df['Volume'] = df['Volume'].astype(str).str.replace(',', '', regex=False)  # Remove commas

# Define a function to convert Volume to float based on suffix
def convert_volume(volume):
    """Convert a volume value such as '54.56M' or '812K' to a plain number.

    Parameters
    ----------
    volume : str or number
        Volume text with an optional trailing magnitude suffix
        (K = thousand, M = million, B = billion; case-insensitive).
        Thousands separators and surrounding whitespace are ignored.
        Non-string values are converted with ``float``.

    Returns
    -------
    float or int
        The numeric volume. NaN is returned when the value cannot be
        interpreted as a number (for example '-' or a missing value).
    """
    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}

    # Already-numeric (or missing) input: no suffix handling is possible.
    if not isinstance(volume, str):
        try:
            return float(volume)
        except (TypeError, ValueError):
            return float('nan')

    text = volume.strip().replace(',', '')
    suffix = text[-1:].upper()
    if suffix in multipliers:
        try:
            return float(text[:-1]) * multipliers[suffix]
        except ValueError:
            # A suffix with no usable number in front of it (e.g. 'M' alone).
            return float('nan')

    # No suffix: let pandas parse it, yielding NaN if conversion fails.
    return pd.to_numeric(text, errors='coerce')

# Apply the conversion function to the 'Volume' column
df['Volume'] = df['Volume'].apply(convert_volume)

# Keep only the modelling columns and drop any rows with NaN values after conversion.
# .copy() makes the result an independent frame, so adding columns to it later
# cannot trigger pandas' SettingWithCopyWarning.
df = df[['Date', 'Open', 'High', 'Low', 'Price', 'Volume']].dropna().copy()

# Step 3: Feature Engineering
# Target: midpoint between the day's high and low.
df['Avg_Price'] = df['High'].add(df['Low']).div(2)
# Turn each date into its integer ordinal so it can serve as a numeric feature.
df['Date_ordinal'] = df['Date'].apply(lambda ts: ts.toordinal())
feature_columns = ['Date_ordinal', 'Volume', 'Open']
X = df[feature_columns]
y = df['Avg_Price']

# Step 4: Train-Test Split
# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): rows are shuffled even though this is a time series - confirm
# that a random (non-chronological) split is what is intended here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 5: Pipeline Construction
# Standardise the features, expand them to degree-2 polynomial terms
# (no bias column), then fit a linear regression on the expanded features.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(pipeline_steps)

# Step 6: Model Training
pipeline.fit(X_train, y_train)

# Step 7: Model Prediction and Evaluation on Test Data
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error

for label, value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Root Mean Squared Error (RMSE):", rmse),
):
    print(label, value)

# Step 8: Displaying predictions on the test set
# Look up the dates of the held-out rows via the index they share with y_test.
test_dates = df.loc[y_test.index, 'Date']
test_results = pd.DataFrame({
    'Date': test_dates,
    'Actual Avg_Price': y_test,
    'Predicted Avg_Price': y_pred,
})
print(test_results.head())

# Step 9: Making Predictions on New Data (Example)
# Example inputs - replace the volume and open price with estimated or known values.
new_date = pd.Timestamp('2024-11-07').toordinal()
new_volume = 80_000_000
new_open_price = 180.0

# Single-row frame using the same column names and order the pipeline was fitted on.
new_data = pd.DataFrame({
    'Date_ordinal': [new_date],
    'Volume': [new_volume],
    'Open': [new_open_price],
})

# Run the fitted pipeline on the new row and report the single prediction.
new_prediction = pipeline.predict(new_data)
print(f"Predicted Avg Price for new data: {new_prediction[0]}")


Mean Absolute Error (MAE): 0.9639684096350817
Root Mean Squared Error (RMSE): 1.2464184155950697
           Date  Actual Avg_Price  Predicted Avg_Price
561  2022-08-15           172.370           171.540790
101  2024-06-13           214.175           215.142647
51   2024-08-26           225.585           226.651848
63   2024-08-08           211.515           213.225273
1073 2020-08-03           109.765           107.851648
Predicted Avg Price for new data: 180.67284855191656
