In [112]:
import numpy as np
def output(instring):
    print(instring)

### Cleaning and Preparation

In [78]:
#Preaparation
input_file = 'stocks_sales_dataset.csv'
output_file = 'cleaned_data.csv'

with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
    #header = next(f_in)
    #print("Header:", header.strip())
    for line in f_in:
        # Split by comma to count columns
        columns = line.strip().split(',')
        # Check if line has exactly 10 columns
        if len(columns) == 10:
            f_out.write(line)

### CSV to NP ARRAY

In [79]:
#Loading

data = np.genfromtxt(
    'cleaned_data.csv',           
    delimiter=",",                
    dtype=None,                   
    encoding='utf-8',             
    names=None,                   
    missing_values='',            
    filling_values=np.nan, 
    invalid_raise = False #To ignore rows with inconsistant column count
)

In [80]:
print(data.shape)
data = data[1:]
print(data.shape)

(1001, 10)
(1000, 10)


In [81]:
type(data)

numpy.ndarray

### Filling Empty values 

In [85]:
data[0:5,:]

array([['1', 'Stock_1', '2024-03-17', '403.21', '186.7', '143175',
        '650540204538.62', '16.64', '0.81', 'Healthcare'],
       ['2', 'Stock_2', '2024-02-11', '395.96', '377.7', '201937',
        '755740452782.7', '44.14', '2.34', 'Healthcare'],
       ['3', 'Stock_3', '2024-09-21', '244.89', '150.91', '493048',
        '90325484303.82', '16.04', '4.98', 'Technology'],
       ['4', 'Stock_4', '2024-10-12', '197.23', '342.65', '723463',
        '389317541468.42', '35.6', '2.68', 'Healthcare'],
       ['5', 'Stock_5', '2024-02-10', '397.5', '439.9', '856814',
        '986642362836.08', '44.91', '1.49', 'Finance']], dtype='<U15')

In [88]:
data[9,:]

array(['10', 'Stock_10', '2024-02-02', '397.38', '', '208593',
       '652649542521.17', '41.42', '2.35', 'Technology'], dtype='<U15')

In [96]:
data[data == ''] = np.nan

# Check if a column is numeric
def is_numeric(column):
    try:
        column.astype(float)  
        return True
    except ValueError:
        return False

for col in range(data.shape[1]):
    column = data[:, col]
    if is_numeric(column):
        # Convert column to float 
        numeric_col = column.astype(float)
        
        # Calculate the mean excluding NaNs
        mean_value = np.nanmean(numeric_col)
        
        # Fill NaNs with the column mean
        data[:, col] = np.where(np.isnan(numeric_col), mean_value, numeric_col).astype(str)
    else:
        # Fill non-numeric columns with 'unknown' where values are NaN
        data[:, col] = np.where(column == 'nan', 'unknown', column)

# Output the processed data
print(data)


[['1.0' 'Stock_1' '2024-03-17' ... '16.64' '0.81' 'Healthcare']
 ['2.0' 'Stock_2' '2024-02-11' ... '44.14' '2.34' 'Healthcare']
 ['3.0' 'Stock_3' '2024-09-21' ... '16.04' '4.98' 'Technology']
 ...
 ['998.0' 'Stock_998' '2024-01-15' ... '16.75' '2.39' 'Technology']
 ['999.0' 'Stock_999' '2024-01-23' ... '31.24' '2.09' 'Consumer Goods']
 ['1000.0' 'Stock_1000' '2023-12-21' ... '11.34' '0.77' 'Finance']]


In [97]:
data[9,:]

array(['10.0', 'Stock_10', '2024-02-02', '397.38', '299.34062124248',
       '208593.0', '652649542521.17', '41.42', '2.35', 'Technology'],
      dtype='<U15')

###  Mean, standard deviation, min, and max for opening prices by sector 

In [104]:
sector_Wise_Calculation_openning = np.empty((0, 2), dtype=object)  # Empty array

for i in range(1, len(data[:, 9])):
    if data[i, 9] in sector_Wise_Calculation_openning[:, 0]:
        row_index = np.where(sector_Wise_Calculation_openning[:, 0] == data[i, 9])[0][0]
        sector_Wise_Calculation_openning[row_index, 1] = np.append(sector_Wise_Calculation_openning[row_index, 1], np.array([data[i, 3]]))
    else:
        sector_Wise_Calculation_openning = np.append(sector_Wise_Calculation_openning, [[data[i, 9], np.array([data[i, 3]], dtype=object)]], axis=0)
        


In [116]:
output("OPENNING PRICES")
for j in range(0,len(sector_Wise_Calculation_openning[0:,0])):
    print("Sector:",sector_Wise_Calculation_openning[j,0])
    print("Mean:",np.nanmean(sector_Wise_Calculation_openning[j, 1].astype(float)))
    print("std:",np.std(sector_Wise_Calculation_openning[j, 1].astype(float)))
    print("Min:",np.min(sector_Wise_Calculation_openning[j, 1].astype(float)))
    print("Max:",np.max(sector_Wise_Calculation_openning[j, 1].astype(float)))


OPENNING PRICES
Sector: Healthcare
Mean: 289.93319371727745
std: 116.87800724135899
Min: 100.42
Max: 496.04
Sector: Technology
Mean: 303.34781094527364
std: 112.79153665712998
Min: 100.66
Max: 499.51
Sector: Finance
Mean: 297.81723756906075
std: 114.89150096756035
Min: 103.71
Max: 499.56
Sector: Consumer Goods
Mean: 300.71990074980636
std: 120.84474310542984
Min: 100.35
Max: 499.3
Sector: Energy
Mean: 293.9628971962617
std: 114.6564033451898
Min: 100.45
Max: 499.63


###  Mean, standard deviation, min, and max for Closing prices by sector 

In [106]:
sector_Wise_Calculation_closing = np.empty((0, 2), dtype=object)  # Empty array

for i in range(1, len(data[:, 9])):
    if data[i, 9] in sector_Wise_Calculation_closing[:, 0]:
        row_index = np.where(sector_Wise_Calculation_closing[:, 0] == data[i, 9])[0][0]
        sector_Wise_Calculation_closing[row_index, 1] = np.append(sector_Wise_Calculation_closing[row_index, 1], np.array([data[i, 4]]))
    else:
        sector_Wise_Calculation_closing = np.append(sector_Wise_Calculation_closing, [[data[i, 9], np.array([data[i, 4]], dtype=object)]], axis=0)
        


In [113]:
output("CLOSING PRICES")
for j in range(0,len(sector_Wise_Calculation_closing[0:,0])):
    print("Sector:",sector_Wise_Calculation_closing[j,0])
    print("Mean:",np.nanmean(sector_Wise_Calculation_closing[j, 1].astype(float)))
    print("std:",np.std(sector_Wise_Calculation_closing[j, 1].astype(float)))
    print("Min:",np.min(sector_Wise_Calculation_closing[j, 1].astype(float)))
    print("Max:",np.max(sector_Wise_Calculation_closing[j, 1].astype(float)))


CLOSING PRICES
Sector: Healthcare
Mean: 305.68335078534034
std: 109.5139352641685
Min: 113.41
Max: 498.86
Sector: Technology
Mean: 290.5304539427113
std: 113.6927360196837
Min: 100.98
Max: 488.35
Sector: Finance
Mean: 298.44082872928175
std: 115.07589199829263
Min: 109.2
Max: 494.89
Sector: Consumer Goods
Mean: 305.13754716981134
std: 115.12264778750645
Min: 100.43
Max: 492.5
Sector: Energy
Mean: 297.49920560747665
std: 115.552031953667
Min: 105.11
Max: 498.9


In [120]:
output("COMPARISION PRICES")
for k in range(0,len(sector_Wise_Calculation_closing[0:,0])):
    print("Sector:",sector_Wise_Calculation_closing[k,0])
    print("Openning Price Mean:",np.nanmean(sector_Wise_Calculation_openning[k, 1].astype(float)))
    print("Closing Price Mean:",np.nanmean(sector_Wise_Calculation_closing[k, 1].astype(float)))
    print("")

COMPARISION PRICES
Sector: Healthcare
Openning Price Mean: 289.93319371727745
Closing Price Mean: 305.68335078534034

Sector: Technology
Openning Price Mean: 303.34781094527364
Closing Price Mean: 290.5304539427113

Sector: Finance
Openning Price Mean: 297.81723756906075
Closing Price Mean: 298.44082872928175

Sector: Consumer Goods
Openning Price Mean: 300.71990074980636
Closing Price Mean: 305.13754716981134

Sector: Energy
Openning Price Mean: 293.9628971962617
Closing Price Mean: 297.49920560747665



### Correlation Analysis of Opening and Closing Prices

In [124]:
def classify_correlation(correlation):
    """
    Classifies the correlation based on the Pearson correlation coefficient value.

    Parameters:
    correlation (float): The Pearson correlation coefficient between two variables.

    Returns:
    str: A string that classifies the correlation as very weak, weak, strong, etc.
    """

    if correlation > 0.9:
        return "Very Strong Positive Correlation"
    elif correlation > 0.7:
        return "Strong Positive Correlation"
    elif correlation > 0.4:
        return "Moderate Positive Correlation"
    elif correlation > 0.1:
        return "Weak Positive Correlation"
    elif correlation > -0.1:
        # Very weak correlation, distinguishing positive or negative
        if correlation > 0:
            return "Very Weak Positive Correlation"
        else:
            return "Very Weak Negative Correlation"
    elif correlation > -0.4:
        return "Weak Negative Correlation"
    elif correlation > -0.7:
        return "Moderate Negative Correlation"
    elif correlation > -0.9:
        return "Strong Negative Correlation"
    else:
        return "Very Strong Negative Correlation"

In [127]:
opening_prices = data[:, 3].astype(float)
closing_prices = data[:, 4].astype(float)

# Calculate the Pearson correlation coefficient between opening and closing prices
correlation_matrix = np.corrcoef(opening_prices, closing_prices)

correlation = correlation_matrix[0, 1]

# Print the result
print(f"Pearson correlation between opening and closing prices: {correlation}")
print("->",classify_correlation(correlation))

Pearson correlation between opening and closing prices: -0.03121887661554187
-> Very Weak Negative Correlation


### Sector Analysis of Market Capitalization

In [None]:
sectors = data[:, 9]
market_caps = data[:, 6].astype(float)
unique_sectors = np.unique(sectors)

sector_avg_market_cap = {}

for sector in unique_sectors:
    # Find indices where the sector matches
    sector_indices = np.where(sectors == sector)[0]
    
    # Extract the market capitalizations for the matching sector
    sector_market_caps = market_caps[sector_indices]
    
    # Calculate the mean of market caps for this sector
    sector_avg_market_cap[sector] = np.mean(sector_market_caps)

# Print the results
for sector, avg_market_cap in sector_avg_market_cap.items():
    print(f"Sector: {sector} | Average Market Cap: {avg_market_cap:.2f}")