<span style="color:red;">Make for each word its non-standardized feature</span>

In [1]:
!pip install pyenchant



In [2]:
import pandas as pd
import numpy as np
import enchant

from scipy.stats import spearmanr

<h1>Features</h1>

In [3]:
# define the name of the column with names
cname = 'Name'

In [4]:
def average_amount_of_letters(df):
    
    names = df[cname].to_list()
    container = []
    
    for name in names:
        name = name.lower()
        group = name.split()
        amount_of_words = len(group)
        
        amount_of_letters = 0
        for word in group:
            for letter in word:
                if letter.isalpha():
                    amount_of_letters = amount_of_letters + 1
        
        average_amount_of_letters = amount_of_letters / amount_of_words
        container.append(average_amount_of_letters)
    
    return pd.DataFrame(container, columns=['average amount of letters'])

In [1]:
def average_amount_of_syllables(df): 
    
    names = df[cname].to_list()
    container = []
    
    for name in names:
        name = name.lower()
        group = name.split()
        amount_of_words = len(group)
        
        total_amount_of_syllables = 0
        for word in group:
            
            amount_of_syllables = 0
            vowels = "aeiouy"
            
            if word[0] in vowels:
                amount_of_syllables = amount_of_syllables + 1
            for index in range(1, len(word)):
                if word[index] in vowels and word[index - 1] not in vowels:
                    amount_of_syllables = amount_of_syllables + 1
            if word.endswith("e"):
                amount_of_syllables = amount_of_syllables - 1
            if amount_of_syllables == 0:
                amount_of_syllables = amount_of_syllables + 1
    
            total_amount_of_syllables = total_amount_of_syllables + amount_of_syllables
        
        average_amount_of_syllables = total_amount_of_syllables / amount_of_words
        container.append(average_amount_of_syllables)
    
    return pd.DataFrame(container, columns=['average amount of syllables'])
    

In [6]:
def alphabetical_order(df):
    
    names = df[cname].to_list()
    
    abcde_container = []
    middle_container = []
    vwxyz_container = []
    
    for name in names:
        first_letter = name.lower()[0]
        
        if first_letter in 'abcde':
            abcde_container.append(1)
            middle_container.append(0)
            vwxyz_container.append(0)
        elif first_letter in 'vwxyz':
            abcde_container.append(0)
            middle_container.append(0)
            vwxyz_container.append(1)
        else:
            abcde_container.append(0)
            middle_container.append(1)
            vwxyz_container.append(0)
            
    abcde_df = pd.DataFrame(abcde_container, columns=['abcde'])
    middle_df = pd.DataFrame(middle_container, columns=['middle'])
    vwxyz_df = pd.DataFrame(vwxyz_container, columns=['vwxyz'])
    
    merged = pd.concat([abcde_df, middle_df, vwxyz_df], axis=1)
    return merged

In [7]:
def vowel_to_length(df):
    
    names = df[cname].to_list()
    container = []
    
    for name in names:
        name = name.lower()
        group = name.split()
        amount_of_vowels = 0
        total_length = 0
        
        for word in group:
            for letter in word:
                if letter.isalpha():
                    total_length = total_length + 1
                if letter in 'aeiouy':
                    amount_of_vowels = amount_of_vowels + 1
        
        vowel_to_length = amount_of_vowels / total_length
        container.append(vowel_to_length)
        
    return pd.DataFrame(container, columns=['vowel to length ratio'])

In [8]:
def most_repetitive_letter(df):
    
    names = df[cname].to_list()
    container = []
    
    for name in names:
        name = name.lower()
    
        dictionairy = {}
        for letter in name:
            if letter.isalpha():
                if letter in dictionairy:
                    dictionairy[letter] = dictionairy[letter] + 1
                else:
                    dictionairy[letter] = 1

        # get the amount of most repetitive letter
        max_count = 0
        for count in dictionairy.values():
            if count > max_count:
                max_count = count

        container.append(max_count)
        
    return pd.DataFrame(container, columns=['most repetitions'])

In [9]:
def spell_checker(df):
    
    names = df[cname].to_list()
    container = []
    spell = enchant.Dict("en_US")
    spell = enchant.Dict("en_GB")
    
    for name in names:
        #name = name.lower()                             
        group = name.split()
        #amount_of_words = len(group)
        amount_of_errors = 0
        
        for word in group:
            if spell.check(word.strip()):
                pass
            else:
                amount_of_errors = amount_of_errors + 1
                
        if amount_of_errors > 0:
            flag = 1
        else:
            flag = 0
        container.append(flag)
        
    return pd.DataFrame(container, columns=['word has error'])

<br>
<br>
<br>

In [10]:
def assembler(df):
    
    companies = df[['Symbol', 'Name']]                                           
    
    return pd.concat([companies,
                      average_amount_of_letters(df),
                      average_amount_of_syllables(df),
                      alphabetical_order(df),
                      vowel_to_length(df),
                      most_repetitive_letter(df),
                      spell_checker(df)], axis=1)

<h1>Make the Features and store them</h1>

In [13]:
df = pd.read_excel('data_lemma_internet_cleaned.xlsx', sheet_name='sheet2')             
df 

Unnamed: 0,Symbol,Name
0,8878596,the
1,5372682,be
2,4658360,of
3,4270734,and
4,4234482,to
...,...,...
19025,218,marquis
19026,218,handicraft
19027,218,bravely
19028,218,harrisburg


In [15]:
assembler(df).to_excel('z_features.xlsx')

<h1>Spearman's rho check</h1>

In [26]:
symbol = df['Symbol'].to_list()
symbol

['A',
 'AAIC',
 'AAP',
 'AAT',
 'AB',
 'ABB',
 'ABBV',
 'ABC',
 'ABG',
 'ABM',
 'ABR',
 'ABT',
 'ACCO',
 'ACM',
 'ACN',
 'ACR',
 'ACRE',
 'ADC',
 'ADM',
 'AEE',
 'AEG',
 'AEL',
 'AEM',
 'AEO',
 'AER',
 'AES',
 'AFG',
 'AFL',
 'AG',
 'AGCO',
 'AGM',
 'AGO',
 'AGRO',
 'AGX',
 'AHT',
 'AIG',
 'AIN',
 'AIR',
 'AIT',
 'AIV',
 'AIZ',
 'AJG',
 'AJRD',
 'AKO.A',
 'AKR',
 'AL',
 'ALB',
 'ALE',
 'ALEX',
 'ALG',
 'ALK',
 'ALL',
 'ALSN',
 'ALV',
 'ALX',
 'AME',
 'AMG',
 'AMN',
 'AMP',
 'AMRC',
 'AMT',
 'AN',
 'ANF',
 'AON',
 'AORT',
 'AOS',
 'AP',
 'APD',
 'APH',
 'APO',
 'APTV',
 'ARC',
 'ARCO',
 'ARE',
 'ARGO',
 'ARI',
 'ARL',
 'AROC',
 'ARR',
 'ARW',
 'ASA',
 'ASB',
 'ASGN',
 'ASH',
 'ASR',
 'ATGE',
 'ATI',
 'ATO',
 'ATR',
 'AU',
 'AUB',
 'AUD',
 'AUY',
 'AVA',
 'AVB',
 'AVD',
 'AVNT',
 'AVY',
 'AWI',
 'AWK',
 'AWR',
 'AXL',
 'AXP',
 'AXR',
 'AXS',
 'AYI',
 'AZO',
 'AZZ',
 'B',
 'BA',
 'BAC',
 'BAH',
 'BAK',
 'BALL',
 'BANC',
 'BAP',
 'BAX',
 'BB',
 'BBAR',
 'BBD',
 'BBDC',
 'BBVA',
 'BBW',
 'B

In [27]:
name = df['Name'].to_list()
name

['Agilent Technologies',
 'Arlington Asset Investment ',
 'Advance Auto Parts ',
 'American Assets Trust ',
 'AllianceBernstein Holding',
 'ABB ',
 'AbbVie ',
 'AmerisourceBergen ',
 'Asbury Automotive Group ',
 'ABM Industries  ',
 'Arbor Realty Trust ',
 'Abbott Laboratories ',
 'Acco Brands ',
 'AECOM ',
 'Accenture ',
 'ACRES Commercial Realty  ',
 'Ares Commercial Real Estate',
 'Agree Realty ',
 'Archer Daniels Midland Company ',
 'Ameren',
 'AEGON  ',
 'American Equity Investment Life Holding Company ',
 'Agnico Eagle Mines ',
 'American Eagle Outfitters ',
 'AerCap Holdings ',
 'The AES Corporation',
 'American Financial Group ',
 'AFLAC  ',
 'First Majestic Silver',
 'AGCO ',
 'Federal Agricultural Mortgage  ',
 'Assured Guaranty ',
 'Adecoagro ',
 'Argan ',
 'Ashford Hospitality Trust ',
 'American International Group ',
 'Albany International  ',
 'AAR ',
 'Applied Industrial Technologies ',
 'Apartment Investment and Management Company ',
 'Assurant ',
 'Arthur Gallagher Co

In [28]:
# Define two lists of strings
list1 = symbol
list2 = name

# Convert each string to a numerical rank
rank1 = [sorted(list1).index(x) for x in list1]
rank2 = [sorted(list2).index(x) for x in list2]

# Calculate Spearman's rho between the two ranks
rho, pval = spearmanr(rank1, rank2)

print("Spearman's rho:", rho)
print("p-value:", pval)

Spearman's rho: 0.901514292717886
p-value: 0.0
