# CCMM - Data Format

In [12]:
# Load the CCMM company records from the JSON file into `data`.
import json

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = "D:\\Python\\python-works\\ccmm-companies-data-format\\ccmm-company-details.json"
with open(file_path, mode="r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

# Number of top-level entries in the loaded document.
print(len(data))

2465


The dataset contains `2465` company records. Each record has the following structure:
```js
// SAMPLE DATA JSON
{
    name: "",
    link: "",
    SCIAN: "",
    address_info: [],
    main_delegate: [
        {
            name: "",
            url: "",
            position: "",
            social_medias: [""]
        }
    ],
    secondary_delegate: [
        {
            name: "",
            url: "",
            position: "",
            social_medias: [""]
        }
    ]
}

```

In [4]:
# FUNCTION TO RECOGNIZE WEBSITE, ADDRESS, TELEPHONE
# FROM THE address_info LIST
import re

def extract_address_info(address_info):
    """Classify the raw ``address_info`` strings of one company.

    Each entry is sorted into a bucket according to its prefix:

    * ``tel:...``                -> ``"Telephone"`` (leading ``tel:`` removed)
    * ``http://`` / ``https://`` -> ``"Website"`` (kept as is)
    * anything else              -> ``"Address"``

    Args:
        address_info: Iterable of strings. ``None`` or an empty iterable is
            accepted and yields three empty fields.

    Returns:
        dict with the keys ``"Address"``, ``"Telephone"`` and ``"Website"``.
        A bucket without a matching entry is ``""``. Several free-text
        entries are joined with ``", "`` instead of overwriting each other.
    """
    result = {
        "Address": "",
        "Telephone": "",
        "Website": ""
    }
    address_parts = []

    for item in address_info or []:
        # Ignore entries that cannot be classified (non-strings, blanks) so
        # they neither crash the loop nor wipe out an address found earlier.
        if not isinstance(item, str):
            continue
        item = item.strip()
        if not item:
            continue

        if item.startswith('tel:'):
            # Slice off only the leading scheme; str.replace would also
            # remove any later occurrence of "tel:" inside the value.
            result['Telephone'] = item[len('tel:'):]
        elif item.startswith(('http://', 'https://')):
            result['Website'] = item
        else:
            address_parts.append(item)

    result['Address'] = ', '.join(address_parts)
    return result


{'Address': '74 Avenue du Dauphiné Candiac', 'Telephone': '+14504662275', 'Website': ''}


In [19]:
# Collect the distinct lengths of the "SCIAN" value across all companies.
# The scan prints and stops at the first record that has no "SCIAN" key.

scian_set = set()

for company in data:
    if "SCIAN" not in company:
        print(company)
        break
    scian_set.add(len(company["SCIAN"]))

print(scian_set)


{0, 1, 2, 3}


In [16]:
# CODE TO SPLIT THE SCIAN CODES
import re

def split_scian_codes(scian_string):
    """Split a string of concatenated SCIAN entries into one string per code.

    The input is expected to look like
    ``"<code> - <description><code> - <description>..."``. A new entry starts
    at every run of digits that is not preceded by another digit and that is
    followed by ``" - "``.

    Args:
        scian_string: Combined SCIAN text. ``None`` or ``""`` yields ``[]``.

    Returns:
        list of ``"<code> - <description>"`` strings in input order, with
        surrounding whitespace removed from each description. Text that
        appears before the first code is discarded.
    """
    if not scian_string:
        return []

    # Because the pattern has a capture group, re.split returns
    # [text_before_first_code, code1, desc1, code2, desc2, ...].
    parts = re.split(r'(?<!\d)(\d+) - ', scian_string)

    # Element 0 is never a code, whether it is empty or not. Always skipping
    # it keeps codes and descriptions aligned even when the input starts with
    # stray text or whitespace.
    codes = parts[1::2]
    descriptions = parts[2::2]

    return [f'{code} - {desc.strip()}' for code, desc in zip(codes, descriptions)]


In [17]:
# Count how many individual SCIAN codes each company has once its combined
# SCIAN text is split, and collect the distinct counts.

scian_set = set()

for com in data:
    if "SCIAN" not in com:
        # Show the first record without a "SCIAN" key and stop scanning.
        print(com)
        break

    raw_scian = com["SCIAN"]
    # split_scian_codes() works on a single string, but the splitting cell of
    # this notebook indexes com["SCIAN"] as a list. Accept both shapes so the
    # count does not fail with a TypeError on list input.
    scian_items = [raw_scian] if isinstance(raw_scian, str) else raw_scian
    scian_set.add(sum(len(split_scian_codes(item)) for item in scian_items))

print(scian_set)

{1, 2, 3}


In [18]:
# SPLITTING & VERIFYING MODIFICATION
# Replace each company's "SCIAN" value by a list with one entry per code.
# Every element is split (not only the first one), so running this cell again
# on already-split data keeps all codes instead of dropping the 2nd and 3rd.
for com in data:
    if "SCIAN" in com and len(com["SCIAN"]) > 0:
        raw_scian = com["SCIAN"]
        # A bare string is treated as a single combined entry.
        scian_items = [raw_scian] if isinstance(raw_scian, str) else raw_scian
        com["SCIAN"] = [
            code
            for item in scian_items
            for code in split_scian_codes(item)
        ]


{'name': 'Accenture', 'link': 'https://www.ccmm.ca/fr/entreprises/accenture/', 'SCIAN': ['54161 - Services de conseils en gestion'], 'address_info': ['5 Place Ville-Marie suite 1520 Montréal', 'tel:+15148474027', 'http://www.accenture.com'], 'main_delegate': [{'name': 'Martine Lapointe', 'url': '#/', 'position': 'Directrice du Bureau de Montréal', 'social_medias': ['mailto:martine.lapointe@accenture.com']}], 'secondary_delegate': [{'name': 'Grace Ayoub', 'url': '/fr/membres/grace-ayoub/', 'position': 'Managing Director', 'social_medias': ['mailto:grace.ayoub@accenture.com']}, {'name': 'Kallia Mansour', 'url': '#/', 'position': 'Integrated Marketing Manager', 'social_medias': ['mailto:kallia.mansour@accenture.com']}, {'name': 'Souad Miloudi', 'url': '/fr/membres/souad-miloudi/', 'position': 'Assistante à la directoin', 'social_medias': ['mailto:souad.miloudi@accenture.com']}, {'name': 'Hadi Skalli', 'url': '/fr/membres/hadi-skalli/', 'position': 'Digital Business Intergrational Senior M

In [32]:
# SEPARATE COMPANY DETAILS
# One flat record per company; the raw delegate lists are carried along as is.
company_data = []

for company in data:
    contact = extract_address_info(company["address_info"])
    main_delegates = company["main_delegate"]
    secondary_delegates = company["secondary_delegate"]

    # Pad the SCIAN codes to exactly three slots ('' when a slot is unused).
    scian_slots = list(company["SCIAN"][:3])
    scian_slots += [''] * (3 - len(scian_slots))

    record = {
        "Name": company["name"],
        "Webpage": company["link"],
        "Address": contact["Address"],
        "Telephone": contact["Telephone"],
        "Website": contact["Website"],
        "No. Main Delegates": len(main_delegates),
        "No. Secondary Delegates": len(secondary_delegates),
        "To. Delegates": len(main_delegates) + len(secondary_delegates),
        "SCIAN Code 1": scian_slots[0],
        "SCIAN Code 2": scian_slots[1],
        "SCIAN Code 3": scian_slots[2],
        "main_delegate": main_delegates,
        "secondary_delegate": secondary_delegates,
    }
    company_data.append(record)


In [29]:
# Turn the list of company records into a DataFrame
# and save it to an Excel workbook.
import pandas as pd

companies_df = pd.json_normalize(data=company_data)
companies_df.to_excel(excel_writer="company_details.xlsx", sheet_name="Company Detail")

# Preview of the first five rows.
print(companies_df.head(5))

                                             Name  \
0                                       Accenture   
1                       ALLIED PROPERTIES MGT LTD   
2                    Alstom Transport Canada Inc.   
3  Association de la construction du Québec - ACQ   
4                 Autorité des marchés financiers   

                                             Webpage  \
0      https://www.ccmm.ca/fr/entreprises/accenture/   
1  https://www.ccmm.ca/fr/entreprises/allied-prop...   
2  https://www.ccmm.ca/fr/entreprises/alstom-tran...   
3  https://www.ccmm.ca/fr/entreprises/association...   
4  https://www.ccmm.ca/fr/entreprises/autorite-de...   

                                             Address     Telephone  \
0            5 Place Ville-Marie suite 1520 Montréal  +15148474027   
1  7000-111 Boul Robert-Bourassa Bureau 100 Montréal  +15148689002   
2             1101, rue Parent 18e étage Saint-Bruno  +15147641725   
3             9200, boul. Métropolitain Est Montréal  +151435406

In [61]:
# MAIN DELEGATE
# Wide format: one row per company, with "MD<n> Name" / "MD<n> Position" /
# "MD<n> Email" columns for its n-th main delegate.
comp_detail_delegate = []

for com in data:

    address_info_results = extract_address_info(com["address_info"])

    entry = {
        "Name": com["name"],
        "Address": address_info_results["Address"],
        "Telephone": address_info_results["Telephone"],
        "Website": address_info_results["Website"],
        "Webpage": com["link"],
        "SCIAN Code 1": com["SCIAN"][0] if len(com["SCIAN"]) > 0 else '',
        "SCIAN Code 2": com["SCIAN"][1] if len(com["SCIAN"]) > 1 else '',
        "SCIAN Code 3": com["SCIAN"][2] if len(com["SCIAN"]) > 2 else '',
        "No. Main Delegates": len(com["main_delegate"]),
        "No. Secondary Delegates": len(com["secondary_delegate"]),
        "To. Delegates": len(com["main_delegate"]) + len(com["secondary_delegate"]),
    }

    # Delegates are numbered from 1. Companies with fewer delegates simply
    # lack the higher-numbered keys.
    for i, md in enumerate(com['main_delegate'], start=1):
        delegate_key = f'MD{i}'
        entry[f'{delegate_key} Name'] = md['name']
        entry[f'{delegate_key} Position'] = md['position']
        # First "mailto:" link with the scheme removed; '' when the delegate
        # has no e-mail link (next() falls back to its default).
        mailto_link = next((s for s in md['social_medias'] if s.startswith('mailto:')), '')
        entry[f'{delegate_key} Email'] = mailto_link.replace('mailto:', '')

    # Carry over the raw company keys as well, except the nested delegate
    # lists, which are already flattened above.
    entry.update(com)
    del entry["main_delegate"]
    del entry["secondary_delegate"]

    comp_detail_delegate.append(entry)

print(comp_detail_delegate[0:5])

[{'Name': 'Accenture', 'Address': '5 Place Ville-Marie suite 1520 Montréal', 'Telephone': '+15148474027', 'Website': 'http://www.accenture.com', 'Webpage': 'https://www.ccmm.ca/fr/entreprises/accenture/', 'SCIAN Code 1': '54161 - Services de conseils en gestion', 'SCIAN Code 2': '', 'SCIAN Code 3': '', 'No. Main Delegates': 1, 'No. Secondary Delegates': 5, 'To. Delegates': 6, 'MD1 Name': 'Martine Lapointe', 'MD1 Position': 'Directrice du Bureau de Montréal', 'MD1 Email': 'martine.lapointe@accenture.com', 'name': 'Accenture', 'link': 'https://www.ccmm.ca/fr/entreprises/accenture/', 'SCIAN': ['54161 - Services de conseils en gestion'], 'address_info': ['5 Place Ville-Marie suite 1520 Montréal', 'tel:+15148474027', 'http://www.accenture.com']}, {'Name': 'ALLIED PROPERTIES MGT LTD', 'Address': '7000-111 Boul Robert-Bourassa Bureau 100 Montréal', 'Telephone': '+15148689002', 'Website': 'http://www.alliedreit.com', 'Webpage': 'https://www.ccmm.ca/fr/entreprises/allied-properties-mgt-ltd/', '

In [64]:
# Build the wide "company + main delegates" table, blank out missing
# values, and export it to Excel.
import pandas as pd

comp_md_data = pd.json_normalize(data=comp_detail_delegate).fillna('')
comp_md_data.to_excel(excel_writer="company_md_details.xlsx")

# Preview of the first five rows.
print(comp_md_data.head(5))

                                             Name  \
0                                       Accenture   
1                       ALLIED PROPERTIES MGT LTD   
2                    Alstom Transport Canada Inc.   
3  Association de la construction du Québec - ACQ   
4                 Autorité des marchés financiers   

                                             Address     Telephone  \
0            5 Place Ville-Marie suite 1520 Montréal  +15148474027   
1  7000-111 Boul Robert-Bourassa Bureau 100 Montréal  +15148689002   
2             1101, rue Parent 18e étage Saint-Bruno  +15147641725   
3             9200, boul. Métropolitain Est Montréal  +15143540609   
4                2640, boul. Laurier 8e étage Québec  +14185250337   

                                      Website  \
0                    http://www.accenture.com   
1                   http://www.alliedreit.com   
2  https://www.alstom.com/fr/alstom-au-canada   
3                        https://www.acq.org/   
4        https:

In [65]:
import urllib.parse

# Collect the kinds of contact links used by any delegate: "email" for
# mailto: links, otherwise the network location of the URL.
social_media = set()
for company in data:
    # Main delegates first, then secondary delegates.
    for role in ("main_delegate", "secondary_delegate"):
        for delegate in company[role]:
            for link in delegate["social_medias"]:
                if link.startswith('mailto:'):
                    kind = "email"
                else:
                    # Domain part of the URL.
                    kind = urllib.parse.urlparse(link).netloc
                social_media.add(kind)

# Print out the unique social media types found
print(social_media)


{'www.linkedin.com', 'www.facebook.com', 'email'}


In [71]:
# Long format: one record per main delegate, with contact links split by kind.
main_delegate_data = []

for company in data:
    for delegate in company["main_delegate"]:
        email = facebook = linkedin = ""

        # Classify each link. If several links of one kind exist, the last
        # one seen is kept.
        for url in delegate["social_medias"]:
            if url.startswith('mailto:'):
                email = url.replace("mailto:", '').lower()
            elif 'facebook' in url:
                facebook = url.lower()
            elif 'linkedin' in url:
                linkedin = url.lower()

        main_delegate_data.append({
            "Company Name": company["name"],
            "Name": delegate['name'],
            "Position": delegate['position'],
            "Email": email,
            "Facebook": facebook,
            "Linkedin": linkedin,
        })

# Preview of the first five records.
print(main_delegate_data[:5])

[{'Company Name': 'Accenture', 'Name': 'Martine Lapointe', 'Position': 'Directrice du Bureau de Montréal', 'Email': 'martine.lapointe@accenture.com', 'Facebook': '', 'Linkedin': ''}, {'Company Name': 'ALLIED PROPERTIES MGT LTD', 'Name': 'Jean-François Burdet', 'Position': 'Vice-président régional, Est du Canada', 'Email': 'jfburdet@alliedreit.com', 'Facebook': '', 'Linkedin': ''}, {'Company Name': 'Alstom Transport Canada Inc.', 'Name': 'Olivier Marcil', 'Position': 'Vice-président affaires publiques Canada', 'Email': 'olivier.marcil@alstomgroup.com', 'Facebook': '', 'Linkedin': ''}, {'Company Name': 'Association de la construction du Québec - ACQ', 'Name': 'Felix Rhéaume', 'Position': 'Directeur aux affaires publiques et gouvernementales', 'Email': 'rheaumef@acq.org', 'Facebook': '', 'Linkedin': ''}, {'Company Name': 'Autorité des marchés financiers', 'Name': 'Yves Ouellet', 'Position': 'Président Directeur Général', 'Email': 'yves.ouellet@lautorite.qc.ca', 'Facebook': '', 'Linkedin':

In [72]:
import pandas as pd

# Tabular form of the main-delegate records (one row per delegate).
main_delegate_sheet = pd.json_normalize(data=main_delegate_data)

print(main_delegate_sheet.head(5))

                                     Company Name                  Name  \
0                                       Accenture      Martine Lapointe   
1                       ALLIED PROPERTIES MGT LTD  Jean-François Burdet   
2                    Alstom Transport Canada Inc.        Olivier Marcil   
3  Association de la construction du Québec - ACQ         Felix Rhéaume   
4                 Autorité des marchés financiers          Yves Ouellet   

                                            Position  \
0                   Directrice du Bureau de Montréal   
1             Vice-président régional, Est du Canada   
2           Vice-président affaires publiques Canada   
3  Directeur aux affaires publiques et gouverneme...   
4                        Président Directeur Général   

                            Email Facebook Linkedin  
0  martine.lapointe@accenture.com                    
1         jfburdet@alliedreit.com                    
2  olivier.marcil@alstomgroup.com                 

In [73]:
# Long format: one record per secondary delegate.
secondary_delegate_data = []

for company in data:
    for sd in company["secondary_delegate"]:
        row = {
            "Company Name": company["name"],
            "Name": sd['name'],
            "Position": sd['position'],
            "Email": "",
            "Facebook": "",
            "Linkedin": ""
        }

        # Fill the contact columns from the delegate's links. A later link of
        # the same kind overwrites an earlier one.
        for link in sd["social_medias"]:
            if link.startswith('mailto:'):
                row['Email'] = link.replace("mailto:", '').lower()
                continue
            if 'facebook' in link:
                row["Facebook"] = link.lower()
                continue
            if 'linkedin' in link:
                row["Linkedin"] = link.lower()

        secondary_delegate_data.append(row)

# Preview of the first five records.
print(secondary_delegate_data[:5])

[{'Company Name': 'Accenture', 'Name': 'Grace Ayoub', 'Position': 'Managing Director', 'Email': 'grace.ayoub@accenture.com', 'Facebook': '', 'Linkedin': ''}, {'Company Name': 'Accenture', 'Name': 'Kallia Mansour', 'Position': 'Integrated Marketing Manager', 'Email': 'kallia.mansour@accenture.com', 'Facebook': '', 'Linkedin': ''}, {'Company Name': 'Accenture', 'Name': 'Souad Miloudi', 'Position': 'Assistante à la directoin', 'Email': 'souad.miloudi@accenture.com', 'Facebook': '', 'Linkedin': ''}, {'Company Name': 'Accenture', 'Name': 'Hadi Skalli', 'Position': 'Digital Business Intergrational Senior Manager', 'Email': 'hadi.skalli@accenture.com', 'Facebook': '', 'Linkedin': ''}, {'Company Name': 'Accenture', 'Name': 'François Ste-Marie', 'Position': 'Montreal Market Lead', 'Email': 'francois.ste-marie@accenture.com', 'Facebook': '', 'Linkedin': ''}]


In [74]:
import pandas as pd

# Tabular form of the secondary-delegate records (one row per delegate).
secondary_delegate_sheet = pd.json_normalize(data=secondary_delegate_data)

print(secondary_delegate_sheet.head(5))

  Company Name                Name  \
0    Accenture         Grace Ayoub   
1    Accenture      Kallia Mansour   
2    Accenture       Souad Miloudi   
3    Accenture         Hadi Skalli   
4    Accenture  François Ste-Marie   

                                         Position  \
0                               Managing Director   
1                    Integrated Marketing Manager   
2                       Assistante à la directoin   
3  Digital Business Intergrational Senior Manager   
4                            Montreal Market Lead   

                              Email Facebook Linkedin  
0         grace.ayoub@accenture.com                    
1      kallia.mansour@accenture.com                    
2       souad.miloudi@accenture.com                    
3         hadi.skalli@accenture.com                    
4  francois.ste-marie@accenture.com                    


In [75]:
# COMPANY DETAILS
# One flat record per company: contact fields, up to three SCIAN codes and
# delegate counts. No per-delegate data is included here.
comp_detail_data = []

for company in data:
    contact = extract_address_info(company["address_info"])

    # Pad the SCIAN codes to exactly three slots ('' when a slot is unused).
    scian_slots = list(company["SCIAN"][:3])
    scian_slots += [''] * (3 - len(scian_slots))

    n_main = len(company["main_delegate"])
    n_secondary = len(company["secondary_delegate"])

    comp_detail_data.append({
        "Name": company["name"],
        "Address": contact["Address"],
        "Telephone": contact["Telephone"],
        "Website": contact["Website"],
        "Webpage": company["link"],
        "SCIAN Code 1": scian_slots[0],
        "SCIAN Code 2": scian_slots[1],
        "SCIAN Code 3": scian_slots[2],
        "No. Main Delegates": n_main,
        "No. Secondary Delegates": n_secondary,
        "To. Delegates": n_main + n_secondary,
    })

# Preview of the first five records.
print(comp_detail_data[0:5])

[{'Name': 'Accenture', 'Address': '5 Place Ville-Marie suite 1520 Montréal', 'Telephone': '+15148474027', 'Website': 'http://www.accenture.com', 'Webpage': 'https://www.ccmm.ca/fr/entreprises/accenture/', 'SCIAN Code 1': '54161 - Services de conseils en gestion', 'SCIAN Code 2': '', 'SCIAN Code 3': '', 'No. Main Delegates': 1, 'No. Secondary Delegates': 5, 'To. Delegates': 6}, {'Name': 'ALLIED PROPERTIES MGT LTD', 'Address': '7000-111 Boul Robert-Bourassa Bureau 100 Montréal', 'Telephone': '+15148689002', 'Website': 'http://www.alliedreit.com', 'Webpage': 'https://www.ccmm.ca/fr/entreprises/allied-properties-mgt-ltd/', 'SCIAN Code 1': "52311 - Services bancaires d'investissement et commerce des valeurs mobilières", 'SCIAN Code 2': '', 'SCIAN Code 3': '', 'No. Main Delegates': 1, 'No. Secondary Delegates': 5, 'To. Delegates': 6}, {'Name': 'Alstom Transport Canada Inc.', 'Address': '1101, rue Parent 18e étage Saint-Bruno', 'Telephone': '+15147641725', 'Website': 'https://www.alstom.com/f

In [76]:
import pandas as pd

# Tabular form of the per-company records (one row per company).
comp_info_sheet = pd.json_normalize(data=comp_detail_data)

In [77]:
# Destination workbook for the combined export.
file_path = 'ccmm_company_details.xlsx'

# One worksheet per table, written in this order and without the index column.
sheets = [
    ('Company Info', comp_info_sheet),
    ('Main Delegates', main_delegate_sheet),
    ('Secondary Delegates', secondary_delegate_sheet),
]

with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
    for sheet_title, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_title, index=False)