## Import the required libraries

In [1]:
#!pip install gensim

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
import jieba
import re

# Download the required NLTK resources
nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

  re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
  re_skip_default = re.compile("(\r\n|\s)", re.U)
  re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## Load the dataset from the file (changes needed)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!cd /content/drive/MyDrive/Capstone_2025

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Capstone_2025/Copy of FIEC_Basic.csv')

  df = pd.read_csv('/content/drive/MyDrive/Capstone_2025/Copy of FIEC_Basic.csv')


In [6]:
df.head()

Unnamed: 0,companyname,enterpriseID,startdate,industry,regcap,regcap_currency,businessoperations,address,city,jv_broad,lastreport,exit_reported,coo,gb_munic,prov,uid,regcap_10kUSD,investorID
0,捷益尔（上海）机器人有限公司,91310115MA1K4RG98K,2021-05-20,其他机械设备及电子产品批发,300.0,万元人民币,一般项目：机器人、机械设备、五金产品、电子产品、计算机软硬件及辅助设备、仪器仪表及上述商品零...,中国（上海）自由贸易试验区金科路2889弄3号6层625室,上海市,0.0,2023.0,0,日本,310000.0,31.0,1000005000.0,41.754002,966065664
1,苏州市力克设备制造有限公司,91320594MACN6R8H6R,2023-06-13,缝制机械制造,50.0,万欧元,一般项目：缝制机械制造；皮革、毛皮及其制品加工专用设备制造；日用品生产专用设备制造；纺织专用...,苏州工业园区润胜路1号6号厂房一层和二层西南角,苏州市,0.0,2023.0,0,法国,320500.0,32.0,1000025000.0,57.594501,278478016
2,宁波樱迅机械化工有限公司,91330211734248216P,2002-01-04,其他金属加工机械制造,32.0,万美元,机械配件制造、加工；医药中间体（除化学危险品）的制造、加工；船舶配件及设备、船用仪器仪表、纺...,宁波市镇海蟹浦工业开发区,宁波市,1.0,2023.0,0,新加坡,330200.0,33.0,1000048000.0,32.0,814442432|253445184
3,上海浦东尤尼恩皮革制品有限公司,607293890,1993-12-31,皮鞋制造,93.0,万美元,设计、生产皮鞋、皮鞋部件、皮带类皮革制品、配套的五金件，橡胶材质的劳防手套、鞋、充气玩具、积...,上海市浦东新区合庆镇向东村三队陆家宅23号,上海市,1.0,2017.0,1,日本,310000.0,31.0,1000067000.0,93.0,121008896|543976192
4,亚太（天津）商业保理有限公司,91120118MA068X1F5H,2017-11-28,其他未包括金融业,5000.0,万元人民币,以受让应收账款的方式提供贸易融资；应收账款的收付结算、管理与催收；销售分户（分类）账管理；客...,天津自贸试验区（东疆保税港区）亚洲路6865号金融贸易中心北区1-1-2005-11,天津市,1.0,2022.0,1,香港,120000.0,12.0,1000077000.0,695.90002,756254336|701767936


In [7]:
print(df.isnull().sum())

companyname             6998
enterpriseID            3974
startdate               4062
industry                3965
regcap                  5102
regcap_currency         3967
businessoperations      7268
address                58633
city                   59145
jv_broad               58411
lastreport             68674
exit_reported          58466
coo                   148910
gb_munic              113231
prov                  113231
uid                   112857
regcap_10kUSD         112857
investorID            114086
dtype: int64


## Creating the Dataframe with company name and bussiness operations

In [8]:
company_operations_df = df[['companyname', 'businessoperations']].copy()

## Data cleaning

In [9]:

# Function to clean text data
def clean_text(text):
    if isinstance(text, str):
        # Remove line breaks and tabs
        text = text.replace('\n', '').replace('\t', '')

        legal_phrases = [
         '一般项目',
         '许可项目',
         '除依法须经批准的项目外，凭营业执照依法自主开展经营活动',
         '依法须经批准的项目，经相关部门批准后方可开展经营活动',
         '具体经营项目以相关部门批准文件或许可证件为准'
       ]
        for phrase in legal_phrases:
           text = text.replace(phrase, '')


        # Remove fullwidth punctuation (Chinese semicolons and parentheses)
        text = text.replace('；', '').replace('（', '').replace('）', '')

        # Remove extra spaces (including multiple spaces)
        text = ' '.join(text.split())

        # Remove leading/trailing blanks
        text = text.strip()
    return text

# Apply cleaning to both columns
company_operations_df['businessoperations'] = company_operations_df['businessoperations'].apply(clean_text)
company_operations_df['companyname'] = company_operations_df['companyname'].apply(clean_text)

display(company_operations_df.head())

Unnamed: 0,companyname,businessoperations
0,捷益尔上海机器人有限公司,：机器人、机械设备、五金产品、电子产品、计算机软硬件及辅助设备、仪器仪表及上述商品零部件的批...
1,苏州市力克设备制造有限公司,：缝制机械制造皮革、毛皮及其制品加工专用设备制造日用品生产专用设备制造纺织专用设备制造通用设...
2,宁波樱迅机械化工有限公司,机械配件制造、加工医药中间体除化学危险品的制造、加工船舶配件及设备、船用仪器仪表、纺织品、塑...
3,上海浦东尤尼恩皮革制品有限公司,设计、生产皮鞋、皮鞋部件、皮带类皮革制品、配套的五金件，橡胶材质的劳防手套、鞋、充气玩具、积...
4,亚太天津商业保理有限公司,以受让应收账款的方式提供贸易融资应收账款的收付结算、管理与催收销售分户分类账管理客户资信调查...


!! As the null values is only 1.34% of the total records, as of now just dropped them

In [10]:
print("Null values in company_operations_df:")
print(company_operations_df.isnull().sum())

Null values in company_operations_df:
companyname           6998
businessoperations    7268
dtype: int64


## Data Preprocessing

Run the notebook in GPU for faster execution

In [11]:
# Dropping the null records
company_operations_df.dropna(subset=['businessoperations'], inplace=True)

# English stopwords
english_stopwords = set(stopwords.words('english'))

# Basic Chinese stopwords (We can extend this list later)
chinese_stopwords = set([
    '的', '了', '是', '在', '和', '有', '我', '你', '他', '她', '它', '们', '人', '为', '以', '个', '这', '那', '之', '不', '要', '可以',
    '进行', '以及', '等', '并', '向', '在内', '包括', '服务', '业务', '项目', '经营范围', '生产', '销售', '开发', '技术', '产品', '提供', '从事', '相关'
])


def preprocess_text(text):
    """Cleans and tokenizes Chinese/English business operation text."""
    if not isinstance(text, str) or not text.strip():
        return ''

    # Keep only Chinese and English characters
    text = re.sub(r'[^\u4e00-\u9fffA-Za-z]', ' ', text)
    text = text.lower()

    # Tokenize using Jieba
    tokens = jieba.lcut(text)

    # Remove short tokens and stopwords
    tokens = [t for t in tokens if len(t) > 1 and t not in chinese_stopwords and t not in english_stopwords]

    return ' '.join(tokens)

# Apply preprocessing
company_operations_df['businessoperations_processed'] = company_operations_df['businessoperations'].apply(preprocess_text)

# -------------------------------
# Tokenise and create dictionary/corpus
# -------------------------------
tokenized_businessoperations = [
    doc.split() for doc in company_operations_df['businessoperations_processed'].astype(str).tolist()
]

import gensim.corpora as corpora # Import the corpora module

dictionary = corpora.Dictionary(tokenized_businessoperations)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_businessoperations]

print("✅ Sample of tokenized business operations:")
print(tokenized_businessoperations[:5])

print("\n📚 Gensim Dictionary:")
print(dictionary)

print("\n🧮 Sample of Gensim Corpus (Bag-of-Words):")
print(corpus[:5])

print("\nNumber of documents:", len(tokenized_businessoperations))
print("Dictionary size:", len(dictionary))


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.601 seconds.
DEBUG:jieba:Loading model cost 0.601 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


✅ Sample of tokenized business operations:
[['机器人', '机械设备', '五金产品', '电子产品', '计算机', '软硬件', '辅助', '设备', '仪器仪表', '上述', '商品', '零部件', '批发', '进出口', '佣金', '代理', '拍卖', '除外', '配套', '机器人', '机械设备', '研究', '设计', '技术转让', '信息技术', '咨询', '商务信息', '咨询', '企业', '管理', '咨询'], ['缝制', '机械制造', '皮革', '毛皮', '及其', '制品', '加工', '专用设备', '制造', '日用品', '专用设备', '制造', '纺织', '专用设备', '制造', '通用设备', '制造', '特种设备', '制造', '机床', '功能', '部件', '附件', '制造', '智能', '控制系统', '集成', '机械零件', '零部件', '加工', '电子', '专用设备', '纺织', '专用设备', '缝制', '机械', '机械设备', '机床', '功能', '部件', '附件', '软件', '电子', '元器件', '批发', '技术开发', '技术咨询', '交流', '技术转让', '技术推广', '信息技术', '咨询服务', '居住', '房地产', '租赁', '货物', '进出口'], ['机械配件', '制造', '加工', '医药', '中间体', '化学', '危险品', '制造', '加工', '船舶', '配件', '设备', '船用', '仪器仪表', '纺织品', '塑料制品', '文具', '家用电器', '日用品', '批发'], ['设计', '皮鞋', '皮鞋', '部件', '皮带', '皮革制品', '配套', '五金件', '橡胶', '材质', '劳防', '手套', '充气', '玩具', '积木', '塑料', '材质', '文具', '包袋', '打包', '胶带', '涉及', '许可证', '管理', '许可证', '经营'], ['受让', '应收', '账款', '方式', '贸易', '融资', '应收', '账款', '收付', '结算', '管理',

## Modelling

In [None]:
from gensim.models.ldamodel import LdaModel

# Instantiate and train the LDA model
# You can adjust num_topics based on your analysis or requirements
num_topics = 10
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=num_topics,
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto')

# Optional: Print the topics
print("LDA Model Topics:")
print(lda_model.print_topics())

## Sample Translation


In [None]:
!pip install googletrans==4.0.0-rc1

In [None]:
from googletrans import Translator
import pandas as pd

translator = Translator()

# Select a small portion of the DataFrame for testing
# Create a copy to avoid SettingWithCopyWarning
df_sample = df.head(10).copy()

df_sample.loc[:, 'businessoperations_en'] = df_sample['businessoperations'].apply(lambda x: translator.translate(x, src='zh-cn', dest='en').text)

In [None]:
print(df_sample[['businessoperations_en', 'businessoperations']])