In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn import metrics

 
import warnings
# Suppress every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')


In [None]:
# Read the Tesla price CSV from the Kaggle input directory, then preview the first rows.
stock_file_path = '../input/tesla-stock-data-from-2010-to-2020/TSLA.csv'
stock_data = pd.read_csv(filepath_or_buffer=stock_file_path)

stock_data.head(n=5)


In [None]:
# (number of rows, number of columns) of the loaded frame.
stock_data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the columns.
stock_data.describe()

In [None]:
# Print each column's dtype and non-null count.
stock_data.info()

In [None]:
# EXPLORATORY DATA ANALYSIS

# Line chart of the closing price; the x axis is the frame's row index, not the date.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(stock_data['Close'])
ax.set_title('Tesla Close price.', fontsize=15)
ax.set_ylabel('Price in dollars.')
plt.show()


In [None]:
# Tesla's stock price shows an upward trend.

In [None]:
# Show the first rows again before comparing 'Close' with 'Adj Close' in the next cell.
stock_data.head()

In [None]:
# Report how often 'Close' and 'Adj Close' agree, then drop the redundant column.
# The comparison is printed explicitly because only a cell's last expression is
# displayed. The guard keeps the cell re-runnable: once 'Adj Close' is gone, a
# second run skips the comparison and drop instead of raising KeyError.
if 'Adj Close' in stock_data.columns:
    same_price = stock_data['Close'] == stock_data['Adj Close']
    print(f"Close == Adj Close on {same_price.sum()} of {len(stock_data)} rows")
    stock_data = stock_data.drop(columns=['Adj Close'])

# checking for null values
stock_data.isnull().sum()


In [None]:
# Distribution of each price/volume column, one panel per column on a 2x3 grid.
features = ['Open', 'High', 'Low', 'Close', 'Volume']
# plt.figure (not plt.subplots) so that no extra full-size Axes is created
# underneath the grid panels added by plt.subplot below.
plt.figure(figsize=(20, 10))
for i, col in enumerate(features):
    ax = plt.subplot(2, 3, i + 1)
    # histplot with a KDE overlay on a density scale replaces the deprecated
    # sns.distplot, which drew a density-normalised histogram plus KDE curve.
    sns.histplot(stock_data[col], kde=True, stat='density', ax=ax)
plt.show()


In [None]:
# Box plot of each column listed in `features`, one panel per column on a 2x3 grid.
# plt.figure (not plt.subplots) so that no extra full-size Axes is created
# underneath the grid panels added by plt.subplot below.
plt.figure(figsize=(20, 10))
for i, col in enumerate(features):
    ax = plt.subplot(2, 3, i + 1)
    # Pass the series as x= explicitly: a bare positional argument is not
    # interpreted the same way by every seaborn release (x vs. data), which
    # would change the orientation of the box.
    sns.boxplot(x=stock_data[col], ax=ax)
plt.show()


In [None]:
# Derive year / month / day columns from 'Date'.
# Parsing with pd.to_datetime (rather than splitting the text on '-') works whether
# the column still holds strings or was already converted to datetime by a later
# cell, so this cell can be re-run at any point without failing on `.str`.
parsed_dates = pd.to_datetime(stock_data['Date'])
stock_data['year'] = parsed_dates.dt.year.astype('int')
stock_data['month'] = parsed_dates.dt.month.astype('int')
stock_data['day'] = parsed_dates.dt.day.astype('int')
stock_data.head()


In [None]:
# Flag rows whose month is a multiple of 3 (March, June, September, December).
# Note the flag is set for every day of such a month, not only its final day.
in_quarter_end_month = stock_data['month'] % 3 == 0
stock_data['is_quarter_end'] = np.where(in_quarter_end_month, 1, 0)
stock_data.head()


In [None]:
# Yearly averages: restrict the frame to its numeric columns, then take the mean
# of each of them per value of 'year'.
numeric_cols = stock_data.select_dtypes(include='number')
year_key = stock_data['year']
data_grouped = numeric_cols.groupby(year_key).mean()


In [None]:
# Bar chart of the grouped mean of each price column, one panel per column (2x2 grid).
# plt.figure (not plt.subplots) so that no extra full-size Axes is created
# underneath the grid panels added by plt.subplot below.
plt.figure(figsize=(20, 10))
for i, col in enumerate(['Open', 'High', 'Low', 'Close']):
    ax = plt.subplot(2, 2, i + 1)
    data_grouped[col].plot.bar(ax=ax)
    # Without a title the four panels cannot be told apart.
    ax.set_title(col)
plt.show()


In [None]:
# From the bar charts, it is clear that prices doubled in 2014.

In [None]:
# Parse 'Date' into datetime values and store them back on the frame.
parsed = pd.to_datetime(stock_data['Date'])
stock_data['Date'] = parsed
# Tag every row with its calendar quarter as a quarterly Period.
stock_data['QuarterYear'] = parsed.dt.to_period('Q')
# Average the numeric columns within each quarter. This rebinds `numeric_cols`
# and `data_grouped`, which until now held the per-year versions.
numeric_cols = stock_data.select_dtypes(include='number')
quarter_key = stock_data['QuarterYear']
data_grouped = numeric_cols.groupby(quarter_key).mean()


In [None]:
# Mean of every numeric column for quarter-end months (1) versus all other months (0).
# numeric_only=True is required because the frame also carries non-numeric columns
# ('Date', 'QuarterYear'); pandas 2.x no longer drops such columns silently and
# raises instead.
stock_data.groupby('is_quarter_end').mean(numeric_only=True)

In [None]:
# Intraday spreads used as model inputs.
open_price, close_price = stock_data['Open'], stock_data['Close']
stock_data['open-close'] = open_price - close_price
stock_data['low-high'] = stock_data['Low'] - stock_data['High']

# Label: 1 when the next row's close is higher than this row's close, else 0.
# The final row has no next close (shift(-1) yields NaN); the comparison is then
# False, so that row is labelled 0.
next_close = close_price.shift(-1)
stock_data['target'] = np.where(next_close > close_price, 1, 0)


In [None]:
# Class balance of the target.
# value_counts() orders by frequency, so hard-coded labels [0, 1] can end up on the
# wrong wedges; sorting by class and labelling from the index keeps them aligned.
class_counts = stock_data['target'].value_counts().sort_index()
plt.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%')
plt.show()
# (The empty plt.figure(...) that used to follow plt.show() was removed: it only
# created a blank figure.)

In [None]:
# Pairwise correlations between the numeric columns.
numeric_columns = stock_data.select_dtypes(include='number')
corr_matrix = numeric_columns.corr()

# Heatmap of a boolean mask: a cell is True where the correlation exceeds 0.9.
fig, ax = plt.subplots(figsize=(10, 8))
above_threshold = corr_matrix > 0.9
sns.heatmap(above_threshold, annot=True, cbar=False, ax=ax)
plt.show()

In [None]:
#Data Splitting and Normalization
features = stock_data[['open-close', 'low-high', 'is_quarter_end']]
target = stock_data['target']

# Split BEFORE scaling: fitting the scaler on all rows would let the validation
# rows influence the mean/std applied to the training data (data leakage).
# NOTE(review): this is a shuffled random split of time-ordered rows; consider a
# chronological split if the goal is to estimate out-of-time performance.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    features, target, test_size=0.1, random_state=2022)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)   # statistics learned from training rows only
X_valid = scaler.transform(X_valid)       # same statistics reused for validation rows
# `features` ends up as a scaled array, as before, now using the train-only statistics.
features = scaler.transform(features)
print(X_train.shape, X_valid.shape)


In [None]:
#Model Development and Evaluation
# Fit three classifiers and report ROC AUC on the training and validation splits.
# The printed labels used to say "Accuracy", but the metric computed is
# roc_auc_score on the predicted probability of class 1.
models = [LogisticRegression(), SVC(
    kernel='poly', probability=True), XGBClassifier()]
for model in models:
    model.fit(X_train, Y_train)
    train_auc = metrics.roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1])
    valid_auc = metrics.roc_auc_score(Y_valid, model.predict_proba(X_valid)[:, 1])
    print(f'{model} : ')
    print('Training ROC AUC : ', train_auc)
    print('Validation ROC AUC : ', valid_auc)
    print()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score, classification_report
import matplotlib.pyplot as plt
# Refit the three classifiers and draw one ROC curve figure per model.
models = [LogisticRegression(), SVC(kernel='poly', probability=True), XGBClassifier()]
model_names = ["Logistic Regression", "SVC", "XGBoost"]
for model, model_name in zip(models, model_names):
    model.fit(X_train, Y_train)

    # Predicted probability of the positive class on the validation split.
    probas = model.predict_proba(X_valid)[:, 1]

    # ROC curve points and the area under the curve.
    fpr, tpr, thresholds = roc_curve(Y_valid, probas)
    roc_auc = roc_auc_score(Y_valid, probas)

    # A new figure for each model; the dashed diagonal is the chance line.
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - {model_name}')
    ax.legend(loc='lower right')

# Render all figures created in the loop.
plt.show()
 
