In [1]:
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Load the example dataset. It was generated by hand for this walkthrough, so exploratory
# data analysis (EDA) is skipped here; for any dataset obtained from another source, EDA is a must.
# The sin_x and cos_x columns hold sin(x) and cos(x), so their meaning is already known to us.

dataset = pd.read_csv("example.csv")
dataset.head() # Show the first five rows.

Unnamed: 0,x,sin_x,cos_x,s_pos,c_pos
0,0.01,0.01,0.99995,True,True
1,0.02,0.019999,0.9998,True,True
2,0.03,0.029996,0.99955,True,True
3,0.04,0.039989,0.9992,True,True
4,0.05,0.049979,0.99875,True,True


In [3]:
# s_pos and c_pos are not needed, so remove them.
# The goal is to model tan_x from sin_x and cos_x, so x itself is not needed either.
# Rebind instead of dropping in place, and ignore columns that are already gone, so that
# re-running this cell does not raise a KeyError.

dataset = dataset.drop(columns=["s_pos", "c_pos", "x"], errors="ignore")
dataset.head()

Unnamed: 0,x,sin_x,cos_x
0,0.01,0.01,0.99995
1,0.02,0.019999,0.9998
2,0.03,0.029996,0.99955
3,0.04,0.039989,0.9992
4,0.05,0.049979,0.99875


In [4]:
# Derive tan(x) from the identity tan(x) = sin(x) / cos(x).
# Where cos(x) is exactly 0 the tangent is undefined. Those rows are stored as NaN rather
# than the string "undefined", so the column stays float64 and the gaps show up as missing
# values in dataset.info() / isna() instead of silently counting as non-null.

dataset["tan_x"] = (dataset["sin_x"] / dataset["cos_x"]).where(dataset["cos_x"] != 0)

dataset.head()

Unnamed: 0,x,sin_x,cos_x,tan_x
0,0.01,0.01,0.99995,0.01
1,0.02,0.019999,0.9998,0.020003
2,0.03,0.029996,0.99955,0.030009
3,0.04,0.039989,0.9992,0.040021
4,0.05,0.049979,0.99875,0.050042


In [5]:
# Inspect column dtypes and non-null counts, in particular for the derived tan_x column.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       500 non-null    float64
 1   sin_x   500 non-null    float64
 2   cos_x   500 non-null    float64
 3   tan_x   500 non-null    float64
dtypes: float64(4)
memory usage: 15.8 KB


In [6]:
# There is no undefined value in tan_x, so we can go straight to making the training and testing data.
# Features are sin_x and cos_x, the target is tan_x, and 20% of the rows are held out for testing.
# random_state pins the shuffle so that re-running the notebook reproduces the same split.

X_train, X_test, y_train, y_test = train_test_split(
    dataset[["sin_x", "cos_x"]],
    dataset["tan_x"],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Summarise the training features (sin_x, cos_x) produced by the split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 455 to 229
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sin_x   400 non-null    float64
 1   cos_x   400 non-null    float64
dtypes: float64(2)
memory usage: 9.4 KB


In [8]:
# Summarise the held-out test features (sin_x, cos_x) produced by the split.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 312 to 389
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sin_x   100 non-null    float64
 1   cos_x   100 non-null    float64
dtypes: float64(2)
memory usage: 2.3 KB


In [9]:
# Summarise the training target (tan_x) produced by the split.
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 400 entries, 455 to 229
Series name: tan_x
Non-Null Count  Dtype  
--------------  -----  
400 non-null    float64
dtypes: float64(1)
memory usage: 6.2 KB


In [10]:
# Summarise the held-out test target (tan_x) produced by the split.
y_test.info()

<class 'pandas.core.series.Series'>
Int64Index: 100 entries, 312 to 389
Series name: tan_x
Non-Null Count  Dtype  
--------------  -----  
100 non-null    float64
dtypes: float64(1)
memory usage: 1.6 KB
