In [1]:
import pandas as pd
import plotly.express as px

### Fetching Training Data

In [2]:
# Location of the training CSV, relative to this notebook
csv_file_path = '../train_data.csv'

# Load the training set; index_col=False tells pandas not to use the first
# CSV column as the index
train_df = pd.read_csv(filepath_or_buffer=csv_file_path, index_col=False)

# Render the loaded frame
display(train_df)

Unnamed: 0,company,day,pricechange,target
0,1,0,-4.145376,0
1,1,1,-0.770141,0
2,1,2,2.009947,0
3,1,3,-2.692152,0
4,1,4,2.044909,0
...,...,...,...,...
57895,965,55,0.051738,1
57896,965,56,2.146372,1
57897,965,57,3.088611,1
57898,965,58,2.210210,1


### Fetching Testing Data

In [3]:
# Location of the test CSV, relative to this notebook
csv_file_path = '../test_data.csv'

# Load the test set; index_col=False tells pandas not to use the first
# CSV column as the index
test_df = pd.read_csv(filepath_or_buffer=csv_file_path, index_col=False)

# Render the loaded frame
display(test_df)

Unnamed: 0,company,day,pricechange,target
0,1,0,-0.393235,0
1,1,1,1.029870,0
2,1,2,1.597008,0
3,1,3,-0.317723,0
4,1,4,-1.476264,0
...,...,...,...,...
57955,966,55,0.322405,1
57956,966,56,-1.943151,1
57957,966,57,0.299330,1
57958,966,58,0.300461,1


In [4]:
# Boolean mask flagging every missing cell of the test set
null_values = test_df.isna()

# Per-column total of missing entries
null_count = null_values.sum(axis=0)

# Print the per-column totals
print(null_count)

company        0
day            0
pricechange    0
target         0
dtype: int64


In [5]:
# Boolean mask flagging every missing cell of the training set
null_values = train_df.isna()

# Per-column total of missing entries
null_count = null_values.sum(axis=0)

# Print the per-column totals
print(null_count)

company        0
day            0
pricechange    0
target         0
dtype: int64


In [6]:
# Keep three decimal places of the training set's price-change feature
rounded_pricechange = train_df['pricechange'].astype(float).round(decimals=3)
train_df['pricechange'] = rounded_pricechange
display(train_df)

Unnamed: 0,company,day,pricechange,target
0,1,0,-4.145,0
1,1,1,-0.770,0
2,1,2,2.010,0
3,1,3,-2.692,0
4,1,4,2.045,0
...,...,...,...,...
57895,965,55,0.052,1
57896,965,56,2.146,1
57897,965,57,3.089,1
57898,965,58,2.210,1


In [7]:
# Keep three decimal places of the test set's price-change feature
rounded_pricechange = test_df['pricechange'].astype(float).round(decimals=3)
test_df['pricechange'] = rounded_pricechange
display(test_df)

Unnamed: 0,company,day,pricechange,target
0,1,0,-0.393,0
1,1,1,1.030,0
2,1,2,1.597,0
3,1,3,-0.318,0
4,1,4,-1.476,0
...,...,...,...,...
57955,966,55,0.322,1
57956,966,56,-1.943,1
57957,966,57,0.299,1
57958,966,58,0.300,1


In [8]:
# Training rows whose target label is 0
is_class0 = train_df['target'].eq(0)
class0_train_data = train_df.loc[is_class0]
display(class0_train_data)

Unnamed: 0,company,day,pricechange,target
0,1,0,-4.145,0
1,1,1,-0.770,0
2,1,2,2.010,0
3,1,3,-2.692,0
4,1,4,2.045,0
...,...,...,...,...
39775,663,55,-0.980,0
39776,663,56,0.159,0
39777,663,57,1.136,0
39778,663,58,-0.010,0


In [9]:
# Training rows whose target label is 1
is_class1 = train_df['target'].eq(1)
class1_train_data = train_df.loc[is_class1]
display(class1_train_data)

Unnamed: 0,company,day,pricechange,target
39780,664,0,-1.124,1
39781,664,1,-0.320,1
39782,664,2,-4.751,1
39783,664,3,-0.200,1
39784,664,4,-3.013,1
...,...,...,...,...
57895,965,55,0.052,1
57896,965,56,2.146,1
57897,965,57,3.089,1
57898,965,58,2.210,1


In [27]:
# Optional one-time setup, left commented out so a normal run installs nothing.
# Uncomment only if plotly is missing from the kernel's environment; it has to
# run before the import cell at the top, which already imports plotly.express.
#!pip install plotly



You should consider upgrading via the 'C:\Python310\python.exe -m pip install --upgrade pip' command.


In [12]:
# Distribution of the price change over the class-0 training rows
fig = px.histogram(
    data_frame=class0_train_data,
    x='pricechange',
    nbins=500,
    title='Histogram of Price Change (Class 0)',
)
fig.show()

In [11]:
# Distribution of the price change over the class-1 training rows
fig = px.histogram(
    data_frame=class1_train_data,
    x='pricechange',
    nbins=500,
    title='Histogram of Price Change (Class 1)',
)
fig.show()

In [13]:
# Summary statistics of the price change for the class-1 training rows
class1_pricechange = class1_train_data['pricechange']
column_stats = class1_pricechange.describe()
display(column_stats)

count    18120.000000
mean         0.049715
std          1.949931
min        -24.229000
25%         -0.834000
50%          0.068000
75%          0.952000
max         30.482000
Name: pricechange, dtype: float64

In [14]:
# Summary statistics of the price change for the class-0 training rows
class0_pricechange = class0_train_data['pricechange']
column_stats = class0_pricechange.describe()
display(column_stats)

count    39780.000000
mean         0.096668
std          1.671930
min        -24.809000
25%         -0.694000
50%          0.086000
75%          0.895000
max         21.140000
Name: pricechange, dtype: float64

In [15]:
# All training rows belonging to company 1; preview the first four
belongs_to_company_1 = train_df['company'].eq(1)
company1_df = train_df.loc[belongs_to_company_1]
display(company1_df.iloc[:4])

Unnamed: 0,company,day,pricechange,target
0,1,0,-4.145,0
1,1,1,-0.77,0
2,1,2,2.01,0
3,1,3,-2.692,0


In [16]:
# Company 1 carries class label 0.
# Line chart of its price change along the day axis.
fig = px.line(
    data_frame=company1_df,
    x='day',
    y='pricechange',
    title='Price Change Over Days',
)
fig.show()

In [17]:
# All training rows belonging to company 664; preview the first four
belongs_to_company_664 = train_df['company'].eq(664)
company2_df = train_df.loc[belongs_to_company_664]
display(company2_df.iloc[:4])

Unnamed: 0,company,day,pricechange,target
39780,664,0,-1.124,1
39781,664,1,-0.32,1
39782,664,2,-4.751,1
39783,664,3,-0.2,1


In [18]:
# company2_df holds company 664, which carries class label 1
# (the previous comment wrongly described it as company 1 / class 0).
# Line plot of its price change along the day axis. The title names the
# company and class so this figure is distinguishable from plots of other
# companies, in the same way the histogram titles carry their class.
fig = px.line(
    company2_df,
    x='day',
    y='pricechange',
    title='Price Change Over Days (Company 664, Class 1)',
)
fig.show()

In [19]:
# Autocorrelation of company2_df's price change, one value per lag from 1 to 10
lags = range(1, 11)
price_series = company2_df['pricechange']
autocorrelation_results = [(lag, price_series.autocorr(lag=lag)) for lag in lags]

# Tabulate the (lag, autocorrelation) pairs
autocorr_df = pd.DataFrame(data=autocorrelation_results, columns=['Lag', 'Autocorrelation'])

# Plot autocorrelation against lag with Plotly Express
fig = px.line(
    data_frame=autocorr_df,
    x='Lag',
    y='Autocorrelation',
    title='Autocorrelation of Price Change',
)
fig.show()

In [20]:
# Autocorrelation of company1_df's price change, one value per lag from 1 to 10
lags = range(1, 11)
price_series = company1_df['pricechange']
autocorrelation_results = [(lag, price_series.autocorr(lag=lag)) for lag in lags]

# Tabulate the (lag, autocorrelation) pairs
autocorr_df = pd.DataFrame(data=autocorrelation_results, columns=['Lag', 'Autocorrelation'])

# Plot autocorrelation against lag with Plotly Express
fig = px.line(
    data_frame=autocorr_df,
    x='Lag',
    y='Autocorrelation',
    title='Autocorrelation of Price Change',
)
fig.show()

In [21]:
# Training set: 965 companies in total
#   class 0 -> 663 companies
#   class 1 -> 302 companies
#
# For every class-0 company, compute the autocorrelation of its price change
# at lags 1..10 and collect (company, lag, autocorrelation) records.

lags = range(1, 11)
autocorrelation_results = []

# Group once by company instead of re-filtering the whole frame for every id
# in a hard-coded range(1, 664): the loop now follows whichever company ids
# are actually present in class0_train_data. groupby yields the ids in
# ascending order, so the records keep their previous order. Ids absent from
# the frame produce no records (previously they would have produced NaN rows).
# NOTE(review): autocorr treats row order as the time axis; this assumes the
# rows are already in day order within each company. Confirm against the CSV.
for company, company_df in class0_train_data.groupby('company'):
    for lag in lags:
        autocorr_value = company_df['pricechange'].autocorr(lag=lag)
        autocorrelation_results.append((company, lag, autocorr_value))

In [22]:
# Tabulate the per-company records currently held in autocorrelation_results
autocorr_df = pd.DataFrame(
    data=autocorrelation_results,
    columns=['Company', 'Lag', 'Autocorrelation'],
)

# Average the autocorrelation over all companies, separately for each lag
lag_means = autocorr_df.groupby('Lag')['Autocorrelation'].mean()
mean_autocorr_df = lag_means.reset_index()

# Plot the mean autocorrelation against lag with Plotly Express
fig = px.line(
    data_frame=mean_autocorr_df,
    x='Lag',
    y='Autocorrelation',
    title='Mean Autocorrelation of Price Change',
)
fig.show()

In [23]:
# Training set: 965 companies in total
#   class 0 -> 663 companies
#   class 1 -> 302 companies
#
# For every class-1 company, compute the autocorrelation of its price change
# at lags 1..10 and collect (company, lag, autocorrelation) records.

lags = range(1, 11)
autocorrelation_results = []

# Group once by company instead of re-filtering the whole frame for every id
# in a hard-coded range(664, 966): the loop now follows whichever company ids
# are actually present in class1_train_data. groupby yields the ids in
# ascending order, so the records keep their previous order. Ids absent from
# the frame produce no records (previously they would have produced NaN rows).
# NOTE(review): autocorr treats row order as the time axis; this assumes the
# rows are already in day order within each company. Confirm against the CSV.
for company, company_df in class1_train_data.groupby('company'):
    for lag in lags:
        autocorr_value = company_df['pricechange'].autocorr(lag=lag)
        autocorrelation_results.append((company, lag, autocorr_value))

In [24]:
# Tabulate the per-company records currently held in autocorrelation_results
autocorr_df = pd.DataFrame(
    data=autocorrelation_results,
    columns=['Company', 'Lag', 'Autocorrelation'],
)

# Average the autocorrelation over all companies, separately for each lag
lag_means = autocorr_df.groupby('Lag')['Autocorrelation'].mean()
mean_autocorr_df = lag_means.reset_index()

# Plot the mean autocorrelation against lag with Plotly Express
fig = px.line(
    data_frame=mean_autocorr_df,
    x='Lag',
    y='Autocorrelation',
    title='Mean Autocorrelation of Price Change',
)
fig.show()