<a href="https://www.kaggle.com/code/markusayt/src-models-notebookea554506b2?scriptVersionId=162606177" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import torch

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Detect the hosting environment so later cells can pick the right data paths.
# `colab` is True only when the google.colab package can be imported.
try:
    import google.colab  # imported only to probe availability
    colab = True
except ImportError:
    # Catch only the "package missing" case; a bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    colab = False

import os
# `kaggle` is the value of KAGGLE_KERNEL_RUN_TYPE, or '' (falsy) when the
# variable is not set -- presumably it is only set inside Kaggle kernels.
kaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [None]:
# When running on Kaggle, fail fast if the kernel has no internet access.
import socket
import warnings
if kaggle:
    try:
        # Probe 1.1.1.1:53 with a per-connection 1 s timeout. Using `timeout=`
        # here (instead of socket.setdefaulttimeout) avoids changing the timeout
        # of every socket created later in the session, and the `with` block
        # closes the probe socket again.
        with socket.create_connection(('1.1.1.1', 53), timeout=1):
            pass
    except OSError as ex:
        # socket.error is an alias of OSError; chain the cause for debugging.
        raise Exception("STOP: No internet. Click '>|' in top right and set 'Internet' switch to on") from ex

In [None]:
# If using fastai, this needs to be run to install the latest version.
# NOTE: the snippet below is wrapped in a string literal, so it is currently
# disabled. The `!pip` lines are IPython shell escapes and are only valid
# inside a notebook cell.
"""if colab:
    !pip install -Uqq fastbook
    import fastbook
    fastbook.setup_book()

elif kaggle:
    !pip install -Uqq fastai"""

In [None]:
# Resolve the three input workbooks for whichever environment the notebook
# runs in, so it can be used from Colab, Kaggle or a local IDE.
# The Kaggle and local versions are linked through GitHub.

from pathlib import Path
if colab:
    from google.colab import drive
    drive.mount("/content/gdrive")
    data_dir = Path('/content/gdrive/MyDrive/DAT255/')
    data_dir.mkdir(exist_ok=True, parents=True)
    # NOTE(review): assumes the workbooks are stored in this Drive folder under
    # the same file names as on Kaggle -- confirm. Previously this branch left
    # DATAScada/DATAFailures undefined and pointed DATA at the directory itself.

elif kaggle:
    data_dir = Path('/kaggle/input/turbine-data')

else:
    data_dir = Path("../../data/processed")

# Same file names in every environment; only the containing directory differs.
DATA = data_dir / "Onsite-MetMast-SCADA-data-2017.xlsx"
DATAScada = data_dir / "Wind-Turbine-SCADA-signals-2017_0.xlsx"
DATAFailures = data_dir / "opendata-wind-failures-2017.xlsx"

In [None]:
# Load the 2017 met-mast workbook that DATA points to into a DataFrame.
data = pd.read_excel(io=DATA, engine="openpyxl")

In [None]:
# Load the 2017 turbine SCADA signals workbook into a DataFrame.
datascada = pd.read_excel(io=DATAScada, engine="openpyxl")

### Testing column-based filtering with DataFrames in pandas
Testing how to open and organize data by column on the failure dataset. This dataset is smaller but is still time-based, so it is good to run some tests on it before moving on to the big datasets.

In [None]:
# Load the 2017 failure log (all turbines) into a DataFrame.
datafailures = pd.read_excel(io=DATAFailures, engine="openpyxl")

In [None]:
# Keep only the failure records that belong to turbine T07.
is_t07_failure = datafailures["Turbine_ID"] == "T07"
T07Failures = datafailures.loc[is_t07_failure]
# Display the result as an example; the failure log is only a small file.
T07Failures

In [None]:
# Keep only the SCADA rows recorded for turbine T07.
T07_scada_data = datascada.loc[datascada["Turbine_ID"] == "T07"]

We will now proceed with an inner join to get rid of NaN and missing data. Ideally, we should also try a mode replacement of the missing data at some point. See the Lesson 5 practical programming video from fastai.

In [None]:
# Join the T07 turbine signals with the environmental data on their shared
# "Timestamp" column.
# Inner join: keeps only timestamps that are present in both frames.
merged_df = T07_scada_data.merge(data, on="Timestamp", how="inner")

# Outer join: keeps every timestamp from either frame, so it will contain NaN
# and other imperfections. To be cleaned up at a later stage.
merged_df_outer_join = T07_scada_data.merge(data, on="Timestamp", how="outer")

In [None]:
# Gauge the scope of missing values: count the NaNs in each column of the
# outer join and write the counts to an Excel sheet.
missing_per_column = merged_df_outer_join.isna().sum()
missing_per_column.to_excel("outer_join.xlsx")

In [None]:
# Print the name of every column in the merged frame, one per line.
for column_name in merged_df.columns:
    print(column_name)

In [None]:
# Visualise one signal over time to look for outliers and anomalies in the data.
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Make sure the x-axis column holds datetimes before plotting.
merged_df['Timestamp'] = pd.to_datetime(merged_df['Timestamp'])

ax = merged_df.plot.scatter(x="Timestamp", y="Gen_Bear_Temp_Avg")

# Label the major ticks with the full date and time, and place one tick per
# day (adjust the locator to match the data frequency if needed).
x_axis = ax.xaxis
x_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H:%M:%S'))
x_axis.set_major_locator(mdates.DayLocator())

# Slant the tick labels so they stay readable.
plt.xticks(rotation=45, ha='right')

# Restrict the visible interval to 2017-11-15 .. 2017-12-01.
ax.set_xlim(pd.Timestamp('2017-11-15'), pd.Timestamp('2017-12-01'))

# Axis labels and title.
ax.set_xlabel('Timestamp')
ax.set_ylabel('Gen_Bear_Temp_Avg')
ax.set_title('Scatter Plot of Gen_Bear_Temp_Avg over Time')

plt.show()


In [None]:
# Check the "spread" (distribution) of the values in a single column.
# This is interesting because we often do not want a long-tailed distribution in our data;
# for long-tailed distributions, make the distribution logarithmic.
merged_df["Gear_Oil_Temp_Avg"].hist()

### Creating the model:

In [None]:
from torch import tensor

In [None]:


# Optional (disabled): export the per-column count of missing values to Excel.
#merged_df.isnull().sum().to_excel('na_summary.xlsx')
# Display the (rows, columns) size of merged_df.
merged_df.shape

In [None]:
# Discard every row that contains a missing value, then remove the textual
# "Turbine_ID" column so that only numeric-convertible columns remain.
merged_df = merged_df.dropna().drop(columns="Turbine_ID")

In [None]:
# Drop the columns that show up with NaN and Inf values later on.
# Each pass removes whichever column currently sits at position 70, so three
# passes remove the columns originally at positions 70, 71 and 72.
# NOTE: this is positional and not idempotent -- re-running the cell removes
# three further columns.
for _ in range(3):
    merged_df = merged_df.drop(columns=merged_df.columns[70])


In [None]:


# Remove columns where every value is 0.
cols_to_remove = merged_df.columns[(merged_df == 0).all()]

# Drop those columns from the DataFrame
merged_df_filtered = merged_df.drop(cols_to_remove, axis=1)

# Build the list of independent variable names ["name1", "name2", ...].
# The dependent variable "Gen_Bear_Temp_Avg" is excluded: leaving it among the
# inputs would let the model read the value it is supposed to predict.
indep_cols = [
    col for col in merged_df_filtered.columns if col != "Gen_Bear_Temp_Avg"
]
indep_cols


In [None]:
# The timestamp must be convertible to a float, so it is reduced to its month
# as a number between 1 and 12.
# The column dtype is checked (rather than probing one row with iloc) so the
# cell also works on frames with fewer than three rows, and is skipped on a
# re-run once the column has already been converted to floats.
if pd.api.types.is_datetime64_any_dtype(merged_df_filtered["Timestamp"]):
    merged_df_filtered['Timestamp'] = merged_df_filtered['Timestamp'].dt.month.astype(float)

In [None]:
# Display the (rows, columns) size of merged_df_filtered.
merged_df_filtered.shape

In [None]:
# Build the tensor of independent variables: take the selected columns as a
# NumPy array and store them as 32-bit floats in torch format.
feature_matrix = merged_df_filtered[indep_cols].values
t_indep = tensor(feature_matrix, dtype=torch.float32)
t_indep

In [None]:
# Dependent variable: the generator bearing average temperature,
# column "Gen_Bear_Temp_Avg".
target_values = merged_df_filtered["Gen_Bear_Temp_Avg"].values
t_dep = tensor(target_values)


In [None]:
# One coefficient per column of t_indep.
n_coeff = t_indep.size(1)
# Random starting coefficients, uniformly drawn from [-0.5, 0.5).
coeffs = torch.rand(n_coeff) - 0.5
coeffs

In [None]:
#checking if compatible with lesson. Jeremy explains broadcasting. 
#t_indep*coeffs

In [None]:
# To improve the model, each column will be divided element-wise by its maximum.
# This improves the behaviour of the multiplications, because the numbers
# involved are then less than 1. Bringing all values in the dataset to a
# "similar size" is called normalizing.

# Per-column maximum (vals) and the row index where it occurs (indices).
vals, indices = torch.max(t_indep, dim=0)

# Display the maxima to inspect the data.
vals[:]

In [None]:
# Scale every column by its maximum. A column whose maximum is 0 would turn
# into NaN/Inf under the division, so such columns are divided by 1 instead
# (i.e. left unchanged).
safe_vals = torch.where(vals == 0, torch.ones_like(vals), vals)
t_indep = t_indep / safe_vals

In [None]:
# Predict with the random coefficients: weight every column by its coefficient
# (broadcast across rows) and add the weighted columns up per row.
weighted = t_indep * coeffs
preds = torch.sum(weighted, dim=1)
preds

In [None]:
# Peek at the first prediction (the slice keeps it as a one-element tensor).
preds[:1]

In [None]:
# Loss: mean absolute error between the predictions and the dependent variable.
abs_error = (preds - t_dep).abs()
loss = abs_error.mean()
loss

In [None]:
# Inspect the first row of t_indep.
t_indep[0]