In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Train/test pattern tables (CSV files named "*_patterns_with_symbols").
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Datasets/VanilaDataset/train_patterns_with_symbols.csv',
        'Datasets/VanilaDataset/test_patterns_with_symbols.csv',
    )
)

# Pre-processed frames; the first two CSV columns become a two-level index.
file_path = "Datasets/VanilaDataset"
train_dataset_processed, test_dataset_processed = (
    pd.read_csv(file_path + csv_name, index_col=[0, 1])
    for csv_name in ("/trainDataset_w_aug.csv", "/testDataset_w_aug.csv")
)

# Stack the train rows on top of the test rows (original row labels are kept).
dataset = pd.concat([train_dataset, test_dataset])
dataset

In [None]:
# Parse Start/End BEFORE sorting: sorting the raw CSV strings orders them
# lexicographically, which only matches chronological order for some date
# formats. After parsing, the sort is always chronological.
dataset = (
    dataset
    .assign(
        Start=lambda frame: pd.to_datetime(frame['Start']),
        End=lambda frame: pd.to_datetime(frame['End']),
    )
    .sort_values(by='Start')
    .reset_index(drop=True)  # renumber rows 0..n-1 in sorted order
)
dataset

In [None]:
# Distinct values of the 'Symbol' column, as an array in order of first
# appearance in `dataset`; the sampler below draws symbols from this array.
symbols = dataset['Symbol'].unique()
symbols

# Creating a Dataset with Label-less Segments


In [None]:
from tqdm import tqdm

num_of_unlabled_seg =10000
# Set a common random seed
SEED = 69
np.random.seed(SEED)

# Sampling rules for the synthetic "No Pattern" segments.
MIN_SEGMENT_DAYS = 10     # shortest candidate segment, in calendar days
MAX_SEGMENT_DAYS = 100    # exclusive upper bound on the candidate length
MAX_OVERLAP_RATIO = 0.2   # max share of a candidate that may overlap an accepted segment

no_pattern_rows = []
# Labelled segments per symbol, sorted by Start. Filled lazily so the filter
# and sort run once per symbol instead of once per sampling attempt.
labelled_by_symbol = {}
# (Start, End) of every accepted segment, per symbol, for the overlap check.
accepted_by_symbol = {}

with tqdm(total=num_of_unlabled_seg, desc="Processing Segments") as pbar:
    # NOTE: progress is only made when a candidate is accepted, so this loop
    # does not terminate if the gaps cannot host `num_of_unlabled_seg` segments.
    while len(no_pattern_rows) < num_of_unlabled_seg:
        # Draw a symbol and fetch (or build) its sorted table of labelled segments.
        random_symbol = np.random.choice(symbols)
        symbol_dataset = labelled_by_symbol.get(random_symbol)
        if symbol_dataset is None:
            symbol_dataset = (
                dataset[dataset['Symbol'] == random_symbol]
                .sort_values(by='Start')
                .reset_index(drop=True)
            )
            labelled_by_symbol[random_symbol] = symbol_dataset
        if len(symbol_dataset) <= 1:
            continue  # need two consecutive labelled segments to define a gap

        # Pick two consecutive labelled segments; the candidate is drawn from
        # the gap between the End of the first and the Start of the second.
        random_index = np.random.randint(0, len(symbol_dataset)-1)
        random_start_segment = symbol_dataset.iloc[random_index]
        random_end_segment = symbol_dataset.iloc[random_index + 1]

        min_start_date = random_start_segment['End']
        num_of_dates = (random_end_segment['Start'] - min_start_date).days
        if num_of_dates <= MIN_SEGMENT_DAYS:
            continue  # gap too short (or the labelled segments overlap)

        # Random start offset, then a random end offset MIN_SEGMENT_DAYS to
        # MAX_SEGMENT_DAYS-1 days later, capped so the end stays inside the gap.
        random_start_date_i = np.random.randint(0, num_of_dates - MIN_SEGMENT_DAYS)
        random_end_right_margin = min(random_start_date_i + MAX_SEGMENT_DAYS, num_of_dates + 1)
        random_end_date_i = np.random.randint(random_start_date_i + MIN_SEGMENT_DAYS, random_end_right_margin)

        random_start_date = min_start_date + pd.DateOffset(days=random_start_date_i)
        random_end_date = min_start_date + pd.DateOffset(days=random_end_date_i)
        candidate_days = (random_end_date - random_start_date).days

        # Reject the candidate if more than MAX_OVERLAP_RATIO of it overlaps a
        # segment already accepted for this symbol. The overlap length is
        # computed for every accepted segment (it is negative when the two are
        # disjoint), so a candidate that fully CONTAINS an accepted segment is
        # caught too; checking only whether a candidate endpoint falls inside
        # an accepted segment misses that case.
        accepted_segments = accepted_by_symbol.setdefault(random_symbol, [])
        intersect = any(
            (min(random_end_date, prev_end) - max(random_start_date, prev_start)).days / candidate_days
            > MAX_OVERLAP_RATIO
            for prev_start, prev_end in accepted_segments
        )
        if intersect:
            continue

        accepted_segments.append((random_start_date, random_end_date))
        no_pattern_rows.append({
            'Symbol': random_symbol,
            'Chart Pattern': 'No Pattern',
            # Copied from the labelled segment that precedes the gap.
            'BullishBearish' : random_start_segment['BullishBearish'],
            'Start': random_start_date,
            'End': random_end_date,
            'Industry' : random_start_segment['Industry'],
            'Pattern_Length' : candidate_days
        })
        pbar.update(1)

no_pattern_df = pd.DataFrame(no_pattern_rows)

In [None]:
# Replace the calendar-day Pattern_Length of every segment with the number of
# OHLC rows whose Date lies inside [Start, End] (both bounds included).
# Rows are processed symbol by symbol so that each OHLC file is read and its
# dates parsed once, instead of once per segment.
rows_by_symbol = no_pattern_df.groupby('Symbol', sort=False).groups

for symbol, row_labels in rows_by_symbol.items():
    # read the OHLC data for the symbol from file
    ohlc_dates = pd.to_datetime(pd.read_csv(f'Datasets/OHLC data/{symbol}.csv')['Date'])
    # Drop any timezone so the dates can be compared with Start/End
    # (assumes Start/End are tz-naive — TODO confirm).
    ohlc_dates = ohlc_dates.dt.tz_localize(None)

    for index in row_labels:
        start_date = pd.to_datetime(no_pattern_df.at[index, 'Start'])
        end_date = pd.to_datetime(no_pattern_df.at[index, 'End'])
        in_segment = (ohlc_dates >= start_date) & (ohlc_dates <= end_date)
        no_pattern_df.at[index, 'Pattern_Length'] = int(in_segment.sum())

no_pattern_df

In [None]:
# Display the generated no-pattern segment table.
no_pattern_df

In [None]:
# Save no_pattern_df to CSV. The row index is written as the first column;
# the reload further down reads it back with index_col=0.
no_pattern_df.to_csv("Datasets/VanilaDataset/no_pattern_10000_df.csv")

In [None]:
from utils.formatAndPreprocess import dataset_format

# Build the formatted data set for the sampled no-pattern segments.
# `dataset_format` is a project helper; with give_instance_index_mapping=True
# it also returns a second object, kept here as `instance_index_mapping`.
# NOTE(review): later cells treat `no_pattern_dataset` as a frame with a
# two-level (Instance, Time) index and the mapping as a dict from instance id
# to row label of `no_pattern_df` — confirm against the helper.
no_pattern_dataset,instance_index_mapping = dataset_format(no_pattern_df,give_instance_index_mapping=True)
no_pattern_dataset , instance_index_mapping

In [None]:
# Display the pre-processed training frame loaded at the top of the notebook.
train_dataset_processed

In [None]:
# Independent copy of the processed training frame that also exposes the
# level-0 index values as an ordinary 'Instance' column.
train_dataset_processed_copy = train_dataset_processed.assign(
    Instance=train_dataset_processed.index.get_level_values(0)
)
train_dataset_processed_copy

In [None]:
Instance_Lits = [2,1,0,6406]
# Keep only the rows whose 'Instance' value is one of the listed ids.
selected_instance_rows = train_dataset_processed_copy['Instance'].isin(Instance_Lits)
train_dataset_processed_copy.loc[selected_instance_rows]

# PU Bagging Model


In [None]:
import time
from sklearn.pipeline import make_pipeline
from sktime.transformations.panel.rocket import MiniRocketMultivariate
from xgboost import XGBClassifier
from utils.FixedLengthTransformer import FixedLengthTransformer

def train_mini_rcket(X_train,y_train, fixed_length=100, num_kernels=10000, n_estimators=100, random_state=None) :
    """Fit a FixedLengthTransformer -> MiniRocketMultivariate -> XGBClassifier pipeline.

    Parameters
    ----------
    X_train : panel of training series, passed unchanged to ``Pipeline.fit``.
    y_train : training labels, passed unchanged to ``Pipeline.fit``.
    fixed_length : int, default 100
        ``fixed_length`` given to the project's ``FixedLengthTransformer``
        (presumably the length each series is padded/truncated to — confirm
        against that class). ``fill_value`` stays 0.
    num_kernels : int, default 10000
        Number of MiniRocket kernels.
    n_estimators : int, default 100
        Number of boosting rounds of the XGBoost classifier.
    random_state : int or None, default None
        Seed forwarded to both MiniRocket and XGBoost. ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    The fitted scikit-learn ``Pipeline``.
    """
    clf_mini_rocket_xgb = make_pipeline(
        FixedLengthTransformer(fixed_length=fixed_length, fill_value=0),
        MiniRocketMultivariate(num_kernels=num_kernels, random_state=random_state),
        XGBClassifier(
            eval_metric='mlogloss',
            n_estimators=n_estimators,
            random_state=random_state,
        ),
    )
    clf_mini_rocket_xgb.fit(X_train, y_train)

    return clf_mini_rocket_xgb

In [None]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# PU bagging over the unlabeled ("No Pattern") segments:
# - Create a training set by combining all positive data points with a random
#   sample from the unlabeled points, drawn with replacement.
# - Build a classifier from this "bootstrap" sample, treating positive and unlabeled
#   data points as positives and negatives, respectively.
# - Apply the classifier to whatever unlabeled data points were not included in the
#   random sample - hereafter called OOB (Out of Bag) points - and record their scores.
# - Repeat the three steps above many times and finally assign to each point the
#   average of the OOB scores it has received.
#
# NOTE: the draw is with replacement, but rows are then selected with `isin`,
# so an instance drawn several times appears only once in the training sample.

# Column of predict_proba whose value is accumulated as the OOB score.
# NOTE(review): 7 is presumably the encoded 'No Pattern' class — confirm
# against the label encoding of the 'Pattern' column.
NO_PATTERN_PROBA_COLUMN = 7
# Share of the unlabeled instances drawn for each bootstrap sample.
UNLABELED_SAMPLE_FRACTION = 0.2

# Level-0 (Instance) index value of every row, and the distinct instances.
unlabeled_instance_level = no_pattern_dataset.index.get_level_values(0)
unique_unlabeled_instance = unlabeled_instance_level.unique()

# Per instance: how many times it has been OOB and the sum of the scores it
# received. `oob_sum` starts as a float so adding probabilities does not
# change the column dtype.
oob_no_pattern_df = pd.DataFrame({'oob_count': 0, 'oob_sum': 0.0}, index=unique_unlabeled_instance)

n_estimators = 250

# Loop-invariant values, computed once.
max_instance = train_dataset_processed.index.get_level_values(0).max()
sample_size = int(UNLABELED_SAMPLE_FRACTION * len(unique_unlabeled_instance))
# Test split; not used by the bagging loop, kept for optional evaluation.
X_test = test_dataset_processed.drop(columns=['Pattern'])
y_test = test_dataset_processed.groupby(level=0)['Pattern'].first().to_frame()

# Create a tqdm object and assign it to a variable
progress_bar = tqdm(range(n_estimators), desc="Training Progress", unit="iteration")

for i in progress_bar:
    # Draw the in-bag instances; every other unlabeled instance is OOB.
    sample_train_instances = np.random.choice(unique_unlabeled_instance, size=sample_size, replace=True)
    in_bag_rows = unlabeled_instance_level.isin(sample_train_instances)
    sample_train = no_pattern_dataset[in_bag_rows].copy()
    sample_test = no_pattern_dataset[~in_bag_rows].copy()

    # Renumber the OOB instances 0..k-1. `oob_instances[j]` is the original
    # instance id behind code j, i.e. behind row j of the prediction.
    oob_codes, oob_instances = pd.factorize(sample_test.index.get_level_values(0))
    oob_time = sample_test.index.get_level_values(1)
    sample_test.index = pd.MultiIndex.from_arrays([oob_codes, oob_time], names=["Instance", "Time"])

    # Renumber the in-bag instances from max_instance + 1 so they cannot
    # collide with the instance ids of the labelled training data.
    train_codes = pd.factorize(sample_train.index.get_level_values(0))[0] + max_instance + 1
    train_time = sample_train.index.get_level_values(1)
    sample_train.index = pd.MultiIndex.from_arrays([train_codes, train_time], names=["Instance", "Time"])

    # Labelled data plus the in-bag unlabeled sample.
    train_data = pd.concat([train_dataset_processed, sample_train])
    X_train = train_data.drop(columns=['Pattern'])
    y_train = train_data.groupby(level=0)['Pattern'].first().to_frame()

    mini_rocket_model = train_mini_rcket(X_train,y_train)

    # Score the OOB instances on exactly the feature columns used for
    # training: the label column (and any helper column) must not be fed to
    # the model as an extra channel.
    X_oob = sample_test.reindex(columns=X_train.columns)
    y_sample_test_prob = np.asarray(mini_rocket_model.predict_proba(X_oob))
    assert len(y_sample_test_prob) == len(oob_instances), "one prediction row expected per OOB instance"

    oob_no_pattern_df.loc[oob_instances, 'oob_count'] += 1
    oob_no_pattern_df.loc[oob_instances, 'oob_sum'] += y_sample_test_prob[:, NO_PATTERN_PROBA_COLUMN]
oob_no_pattern_df

In [None]:
# Reset pandas' 'display.max_rows' option to its default value. Note that this
# restores the default row limit; it does not remove the limit.
pd.reset_option('display.max_rows')
oob_no_pattern_df


In [None]:
# Save the OOB statistics (index = unlabeled instance id) to a CSV file.
oob_no_pattern_df.to_csv('Datasets/VanilaDataset/oob_no_pattern_stats_df.csv')
# Save the instance mapping as a two-column table: the dictionary keys go to
# 'Instance' and the dictionary values to 'Index'.
instance_index_mapping_df = pd.DataFrame(instance_index_mapping.items(), columns=['Instance', 'Index'])
instance_index_mapping_df.to_csv('Datasets/VanilaDataset/instance_index_mapping_df.csv')

In [None]:
# Save the instance mapping dictionary to CSV (keys -> 'Instance', values -> 'Index').
# NOTE(review): these are the same two statements that end the previous cell,
# so the file is simply rebuilt and rewritten.
instance_index_mapping_df = pd.DataFrame(instance_index_mapping.items(), columns=['Instance', 'Index'])
instance_index_mapping_df.to_csv('Datasets/VanilaDataset/instance_index_mapping_df.csv')

In [None]:
# Reload the saved OOB statistics and append the mean OOB score per instance
# (sum of recorded scores divided by the number of times the instance was OOB).
oob_no_pattern_df = pd.read_csv(
    'Datasets/VanilaDataset/oob_no_pattern_stats_df.csv', index_col=0
).assign(Avg_Prob=lambda stats: stats['oob_sum'] / stats['oob_count'])
oob_no_pattern_df

In [None]:
# Sort by the average probability column, highest first. The result is
# assigned back to the same name.
oob_no_pattern_df = oob_no_pattern_df.sort_values(by='Avg_Prob', ascending=False)
oob_no_pattern_df

In [None]:
import matplotlib.pyplot as plt

# Rank the instances by average OOB probability (highest first) and keep the
# first 200 rows for the chart.
sorted_df = oob_no_pattern_df.sort_values(by='Avg_Prob', ascending=False)
top_instances = sorted_df.head(200)

# Horizontal bar chart with one bar per instance.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_instances.index.astype(str), top_instances['Avg_Prob'], color='skyblue')
ax.set_xlabel('Average Probability')
ax.set_ylabel('Instance')
ax.set_title('Sorted Average Probability of No Pattern Instances')

ax.invert_yaxis()  # put the highest value at the top
plt.show()


In [None]:
# Inspect the instances whose average OOB probability exceeds 0.08.
sorted_df[sorted_df['Avg_Prob'] > 0.08]

In [None]:
# Get the 200 instances with the highest average probability
# (`sorted_df` is already ordered by 'Avg_Prob', descending).
top_200_instances = sorted_df.head(200)
top_200_instances

In [None]:
import pandas as pd
import mplfinance as mpf

def plot_ohlc_segment(data_segment):
    """
    Draw one OHLC segment as a candlestick chart with a volume panel.

    Parameters:
    - data_segment (pd.DataFrame): A DataFrame containing columns ['Open', 'High', 'Low', 'Close', 'Volume']

    The caller's frame is left untouched: a copy is re-indexed with synthetic
    daily dates starting at 2024-01-01, because mplfinance needs a datetime index.
    """
    synthetic_dates = pd.date_range(start='2024-01-01', periods=len(data_segment), freq='D')
    candles = data_segment.copy()
    candles.index = synthetic_dates

    # Candlestick chart with the volume shown in a lower panel.
    mpf.plot(
        candles,
        type='candle',
        style='charles',
        volume=True,
        ylabel='Price',
        ylabel_lower='Volume',
        title="OHLC Segment",
        figsize=(10, 6),
    )


# Plot the rows of the formatted no-pattern data set stored under level-0 index label 1005.
plot_ohlc_segment(no_pattern_dataset.loc[1005])


# Creating the full Train and Test sets by adding the no-pattern segments


## Run from here if the train/test data changes


In [None]:
# Reload the train/test pattern tables from disk.
test_dataset = pd.read_csv('Datasets/VanilaDataset/test_patterns_with_symbols.csv')
train_dataset = pd.read_csv('Datasets/VanilaDataset/train_patterns_with_symbols.csv')

# Reload the pre-processed frames; the first two CSV columns form a two-level index.
file_path = "Datasets/VanilaDataset"
train_dataset_processed = pd.read_csv(file_path + "/trainDataset_w_aug.csv", index_col=[0, 1])
test_dataset_processed = pd.read_csv(file_path + "/testDataset_w_aug.csv" , index_col=[0, 1])

In [None]:
# Reload the saved OOB statistics, compute the mean OOB score per instance and
# keep the 200 highest-scoring rows.
oob_no_pattern_df = pd.read_csv('Datasets/VanilaDataset/oob_no_pattern_stats_df.csv', index_col=0)
oob_no_pattern_df["Avg_Prob"] = oob_no_pattern_df['oob_sum'] / oob_no_pattern_df['oob_count']
sorted_df = oob_no_pattern_df.sort_values(by='Avg_Prob', ascending=False)
top_200_instances = sorted_df.head(200)

# Reload the sampled no-pattern segment table (first CSV column is the row index).
no_pattern_df = pd.read_csv('Datasets/VanilaDataset/no_pattern_10000_df.csv', index_col=0)
# Read the saved instance mapping table
instance_index_mapping_df = pd.read_csv('Datasets/VanilaDataset/instance_index_mapping_df.csv', index_col=0)
# Convert back to a dictionary: 'Instance' values are the keys, 'Index' values the values.
instance_index_mapping = dict(zip(instance_index_mapping_df['Instance'], instance_index_mapping_df['Index']))

In [None]:
# Keep only the instance ids (index labels) of the 200 top-ranked rows.
# They are derived from `sorted_df` rather than from `top_200_instances`
# itself, so re-running this cell gives the same result instead of failing
# (an Index has no `.index` attribute).
top_200_instances = sorted_df.head(200).index
top_200_instances

In [None]:
# Random 80% of the top instances for training, the remaining 20% for testing.
# A dedicated, seeded generator makes the split reproducible even when the
# notebook is run from this section only, where the global NumPy seed has not
# been set.
SPLIT_SEED = 69
split_rng = np.random.RandomState(SPLIT_SEED)
train_instances = split_rng.choice(top_200_instances, size=int(0.8 * len(top_200_instances)), replace=False)
test_instances = top_200_instances[~np.isin(top_200_instances, train_instances)]
len(train_instances), len(test_instances)

In [None]:
# Get the train and test OHLC data of the selected no-pattern instances,
# looked up by level-0 index label.
# NOTE(review): `no_pattern_dataset` is not reloaded in this "run from here"
# section; it only exists if the earlier `dataset_format` cell has been run in
# the current kernel.
train_no_pattern_ohlc = no_pattern_dataset.loc[train_instances]
test_no_pattern_ohlc = no_pattern_dataset.loc[test_instances]

In [None]:
# Translate the instance ids into row labels of `no_pattern_df`, fetch those
# rows and renumber them from 0.
mapped_indices_train = [instance_index_mapping[instance_id] for instance_id in train_instances]
train_no_pattern_details = no_pattern_df.loc[mapped_indices_train].reset_index(drop=True)

# Same lookup for the test instances.
mapped_indices_test = [instance_index_mapping[instance_id] for instance_id in test_instances]
test_no_pattern_details = no_pattern_df.loc[mapped_indices_test].reset_index(drop=True)

In [None]:
# Display the detail rows of the no-pattern training instances.
train_no_pattern_details

In [None]:
# Display the OHLC rows of the no-pattern training instances.
train_no_pattern_ohlc

### Combine into existing datasets


In [None]:
# Append the no-pattern detail rows to the labelled train table; rows are
# renumbered 0..n-1.
full_train_no_pattern_details = pd.concat([train_dataset, train_no_pattern_details], ignore_index=True)

# Same for the test table.
full_test_no_pattern_details = pd.concat([test_dataset, test_no_pattern_details], ignore_index=True)

In [None]:

def instance_index_reset(ohlc_multi_idx_df , max_instance= 0):
    """Renumber the level-0 (Instance) index values consecutively after ``max_instance``.

    Instances are numbered in order of first appearance: the first one gets
    ``max_instance + 1``, the next ``max_instance + 2`` and so on. The level-1
    values are kept, and the index levels are named "Instance" and "Time".

    The index of the frame passed in is replaced in place; the same frame is
    returned for convenience.
    """
    current_index = ohlc_multi_idx_df.index
    instance_codes, _ = pd.factorize(current_index.get_level_values(0))
    time_values = current_index.get_level_values(1)

    # Shift the 0-based codes so the new ids start right after max_instance.
    shifted_instances = instance_codes + max_instance + 1

    ohlc_multi_idx_df.index = pd.MultiIndex.from_arrays(
        [shifted_instances, time_values],
        names=["Instance", "Time"],
    )
    return ohlc_multi_idx_df

In [None]:
# Renumber the no-pattern instances so their ids start right after the largest
# instance id already present in the corresponding processed split.
train_max_instance = train_dataset_processed.index.get_level_values(0).max()
test_max_instance = test_dataset_processed.index.get_level_values(0).max()

train_no_pattern_ohlc = instance_index_reset(train_no_pattern_ohlc, max_instance=train_max_instance)
test_no_pattern_ohlc = instance_index_reset(test_no_pattern_ohlc, max_instance=test_max_instance)

In [None]:
# Concatenate the renumbered no-pattern OHLC rows below the processed training frame.
full_train_no_pattern_ohlc = pd.concat([train_dataset_processed, train_no_pattern_ohlc])

# Concatenate the renumbered no-pattern OHLC rows below the processed test frame.
full_test_no_pattern_ohlc = pd.concat([test_dataset_processed, test_no_pattern_ohlc])

In [None]:
# Parse 'Start' and 'End' (values that cannot be parsed become NaT) and keep
# only the calendar date, dropping any time-of-day component.
for date_col in ('Start', 'End'):
    parsed_dates = pd.to_datetime(full_train_no_pattern_details[date_col], errors='coerce')
    full_train_no_pattern_details[date_col] = parsed_dates.dt.date

In [None]:
# Parse 'Start' and 'End' (values that cannot be parsed become NaT) and keep
# only the calendar date, dropping any time-of-day component.
for date_col in ('Start', 'End'):
    parsed_dates = pd.to_datetime(full_test_no_pattern_details[date_col], errors='coerce')
    full_test_no_pattern_details[date_col] = parsed_dates.dt.date

In [None]:
folder_path = "Datasets/VanilaDataset/PU results/"

# Create the output folder, including missing parents. `exist_ok=True` replaces
# the separate existence check, so there is no gap between checking and
# creating, and re-running the cell is safe.
import os
os.makedirs(folder_path, exist_ok=True)

# Save the combined train tables (details and OHLC) to CSV.
full_train_no_pattern_details.to_csv(folder_path+'train_PU_no_pattern_details.csv')
full_train_no_pattern_ohlc.to_csv(folder_path+'train_PU_no_pattern_ohlc.csv')

# Save the combined test tables (details and OHLC) to CSV.
full_test_no_pattern_details.to_csv(folder_path+'test_PU_no_pattern_details.csv')
full_test_no_pattern_ohlc.to_csv(folder_path+'test_PU_no_pattern_ohlc.csv')
